#Para limpiar la data

rm(list = ls()) 

#activar paqueterias

library(rio)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Se importa la data

REALDATA= import("REALDATAOF.xlsx")

#procedemos a ver el tipo de data que tenemos

str(REALDATA)
## 'data.frame':    196 obs. of  10 variables:
##  $ PROVINCIA               : chr  "ABANCAY" "ACOBAMBA" "ACOMAYO" "AIJA" ...
##  $ DEPARTAMENTO            : chr  "APURÍMAC" "HUANCAVELICA" "CUSCO" "ÁNCASH" ...
##  $ CANON                   : chr  "27116046" "4401431" "35544103" "10645792" ...
##  $ POBLACION               : num  121501 34179 23732 6170 155236 ...
##  $ NOMBRE_OP               : chr  "ALIANZAPARAELPROGRESO" "MOVIMIENTOREGIONALAGUA" "MOVIMIENTOREGIONALINKAPACHAKUTEQ" "ALIANZAGOBIERNOUNIDADYACCION-AGUA" ...
##  $ TIPO_OP                 : chr  "PARTIDOPOLÍTICO" "MOVIMIENTOREGIONAL" "MOVIMIENTOREGIONAL" "MOVIMIENTOREGIONAL" ...
##  $ PUESTO                  : num  19 96 184 129 51 67 53 181 133 193 ...
##  $ INCO2022                : num  56.9 40.2 22.8 35.7 46.6 44.1 46.4 25.1 35.1 18.5 ...
##  $ CANT_TRABAJADOR         : num  524 178 109 9 152 218 370 279 657 52 ...
##  $ 
## Hogares_dependientes: num  606 828 364 53 1663 ...

#A recomendación de la JP debemos elegir entre

REALDATA = select(REALDATA, -PUESTO)

#Analisis para evaluar datos perdidos

analysis <- any(is.na(REALDATA$Hogares_dependientes))
analysis <- any(is.na(REALDATA$POBLACION))
analysis <- any(is.na(REALDATA$CANT_TRABAJADOR))
analysis <- any(is.na(REALDATA))

#Eliminar espacios de los numeros porque estorban

REALDATA$CANON <- gsub("\\s+", "", REALDATA$CANON)

#Entonces como no hay datos perdidos, procedemos a cambiar el formato de las variables de character a numerico

REALDATA$CANON <- as.numeric( REALDATA$CANON)

#Procedemos a verificar si las variables se pasaron a numericas

str(REALDATA)
## 'data.frame':    196 obs. of  9 variables:
##  $ PROVINCIA               : chr  "ABANCAY" "ACOBAMBA" "ACOMAYO" "AIJA" ...
##  $ DEPARTAMENTO            : chr  "APURÍMAC" "HUANCAVELICA" "CUSCO" "ÁNCASH" ...
##  $ CANON                   : num  27116046 4401431 35544103 10645792 6454423 ...
##  $ POBLACION               : num  121501 34179 23732 6170 155236 ...
##  $ NOMBRE_OP               : chr  "ALIANZAPARAELPROGRESO" "MOVIMIENTOREGIONALAGUA" "MOVIMIENTOREGIONALINKAPACHAKUTEQ" "ALIANZAGOBIERNOUNIDADYACCION-AGUA" ...
##  $ TIPO_OP                 : chr  "PARTIDOPOLÍTICO" "MOVIMIENTOREGIONAL" "MOVIMIENTOREGIONAL" "MOVIMIENTOREGIONAL" ...
##  $ INCO2022                : num  56.9 40.2 22.8 35.7 46.6 44.1 46.4 25.1 35.1 18.5 ...
##  $ CANT_TRABAJADOR         : num  524 178 109 9 152 218 370 279 657 52 ...
##  $ 
## Hogares_dependientes: num  606 828 364 53 1663 ...

#Procedemos a evaluar los datos descriptivos

summary(REALDATA)
##   PROVINCIA         DEPARTAMENTO           CANON             POBLACION       
##  Length:196         Length:196         Min.   :7.000e+00   Min.   :    3270  
##  Class :character   Class :character   1st Qu.:3.185e+06   1st Qu.:   28765  
##  Mode  :character   Mode  :character   Median :1.207e+07   Median :   61863  
##                                        Mean   :4.873e+07   Mean   :  170391  
##                                        3rd Qu.:3.719e+07   3rd Qu.:  121460  
##                                        Max.   :1.607e+09   Max.   :10004141  
##   NOMBRE_OP           TIPO_OP             INCO2022     CANT_TRABAJADOR 
##  Length:196         Length:196         Min.   :17.00   Min.   :   5.0  
##  Class :character   Class :character   1st Qu.:33.48   1st Qu.:  66.0  
##  Mode  :character   Mode  :character   Median :39.90   Median : 151.0  
##                                        Mean   :40.57   Mean   : 334.8  
##                                        3rd Qu.:46.80   3rd Qu.: 369.2  
##                                        Max.   :69.10   Max.   :6524.0  
##  \r\nHogares_dependientes
##  Min.   :    9.0         
##  1st Qu.:  257.0         
##  Median :  609.5         
##  Mean   :  948.7         
##  3rd Qu.: 1151.8         
##  Max.   :12286.0

#Instalamos mas paqueterias

library(knitr)
library(modelsummary)
## Version 2.0.0 of `modelsummary`, to be released soon, will introduce a
##   breaking change: The default table-drawing package will be `tinytable`
##   instead of `kableExtra`. All currently supported table-drawing packages
##   will continue to be supported for the foreseeable future, including
##   `kableExtra`, `gt`, `huxtable`, `flextable, and `DT`.
##   
##   You can always call the `config_modelsummary()` function to change the
##   default table-drawing package in persistent fashion. To try `tinytable`
##   now:
##   
##   config_modelsummary(factory_default = 'tinytable')
##   
##   To set the default back to `kableExtra`:
##   
##   config_modelsummary(factory_default = 'kableExtra')
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

#RECIEN NOS DIMOS CUENTA DE QUE EL NOMBRE DE HOGARES DEPENDIENTES TENIA ESE NOMBRE RARO, PERO YA LO ARREGLAMOS :)

REALDATA <- REALDATA %>%
  rename(HOGARES_DEP = "\r\nHogares_dependientes")

#INICIO DEL ANALISIS ESTADISTICO #SE PROCEDE A HACER LA REGRESION LINEAL DE POSISSON

#tabla para evaluar si hay correlacion entre las variables

library(ggcorrplot)
## Loading required package: ggplot2
colNums=names(REALDATA)[c(3,4,7,8,9)]
numXs=REALDATA[,colNums]
ggcorrplot(cor(numXs),lab = T,show.diag = F)

#Primero se hace un modelo de regresion lineal entre la variable canon y la variable dependiente indice de corrupcion, considerando la variable de control. Como resultado se evidencia de que existe significancia en su correlacion, es decir, si afecta, sin embargo, el valor no es tan alto, sin embargo, se puede decir que existe una influencia significativa.

modelo1=formula(INCO2022~CANON+POBLACION)
reg1=lm(modelo1,data=REALDATA)
summary(reg1)
## 
## Call:
## lm(formula = modelo1, data = REALDATA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.0338  -6.4218   0.3142   5.6206  27.6545 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.899e+01  7.960e-01  48.973  < 2e-16 ***
## CANON       1.979e-08  5.134e-09   3.854 0.000158 ***
## POBLACION   3.626e-06  1.013e-06   3.580 0.000434 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.33 on 193 degrees of freedom
## Multiple R-squared:  0.1298, Adjusted R-squared:  0.1208 
## F-statistic:  14.4 on 2 and 193 DF,  p-value: 1.488e-06
model1=list('apropiacion (I)'=reg1)
modelsummary(model1, title = "Regresion: modelo 1",
             stars = TRUE,
             output = "kableExtra")
Regresion: modelo 1
 apropiacion (I)
(Intercept) 38.985***
(0.796)
CANON 0.000***
(0.000)
POBLACION 0.000***
(0.000)
Num.Obs. 196
R2 0.130
R2 Adj. 0.121
AIC 1476.6
BIC 1489.7
Log.Lik. -734.312
F 14.396
RMSE 10.25
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001

#Ahora realizamos el segundo modelo, añadiendo a la variable hogares dependientes

modelo2=formula(INCO2022~CANON+HOGARES_DEP+POBLACION)
reg2=lm(modelo2,data=REALDATA)
summary(reg2)
## 
## Call:
## lm(formula = modelo2, data = REALDATA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.5068  -6.2816   0.4413   5.1163  24.3224 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.597e+01  9.594e-01  37.487  < 2e-16 ***
## CANON        1.908e-08  4.839e-09   3.943 0.000113 ***
## HOGARES_DEP  4.183e-03  8.290e-04   5.046 1.04e-06 ***
## POBLACION   -1.742e-06  1.429e-06  -1.219 0.224273    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.734 on 192 degrees of freedom
## Multiple R-squared:  0.2317, Adjusted R-squared:  0.2197 
## F-statistic:  19.3 on 3 and 192 DF,  p-value: 5.577e-11
model2=list('apropiacion (II)'=reg2)
modelsummary(model2, title = "Regresion: modelo 2",
             stars = TRUE,
             output = "kableExtra")
Regresion: modelo 2
 apropiacion (II)
(Intercept) 35.966***
(0.959)
CANON 0.000***
(0.000)
HOGARES_DEP 0.004***
(0.001)
POBLACION 0.000
(0.000)
Num.Obs. 196
R2 0.232
R2 Adj. 0.220
AIC 1454.2
BIC 1470.6
Log.Lik. -722.110
F 19.300
RMSE 9.63
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001

#Para hacer el modelo 3, primero debemos volver a factor la variable TIPO DE Organizacion politica. Donde si es un MOVIMIENTO REGIONAL ES “1”, si NO es un MOVIMIENTO REGIONAL es “0”

REALDATA$TIPO_OP <- ifelse(REALDATA$TIPO_OP  == "MOVIMIENTOREGIONAL", 1, 0)
modelo3=formula(INCO2022~CANON+HOGARES_DEP+TIPO_OP+POBLACION)
reg3=lm(modelo3,data=REALDATA)
summary(reg3)
## 
## Call:
## lm(formula = modelo3, data = REALDATA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.4899  -6.5979  -0.1616   4.8370  24.7367 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.825e+01  1.267e+00  30.197  < 2e-16 ***
## CANON        1.755e-08  4.794e-09   3.661 0.000325 ***
## HOGARES_DEP  4.168e-03  8.157e-04   5.110 7.79e-07 ***
## TIPO_OP     -3.793e+00  1.401e+00  -2.708 0.007378 ** 
## POBLACION   -1.588e-06  1.407e-06  -1.128 0.260525    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.577 on 191 degrees of freedom
## Multiple R-squared:  0.2601, Adjusted R-squared:  0.2446 
## F-statistic: 16.79 on 4 and 191 DF,  p-value: 8.281e-12
model3=list('apropiacion (III)'=reg3)
modelsummary(model3, title = "Regresion: modelo 3",
             stars = TRUE,
             output = "kableExtra")
Regresion: modelo 3
 apropiacion (III)
(Intercept) 38.254***
(1.267)
CANON 0.000***
(0.000)
HOGARES_DEP 0.004***
(0.001)
TIPO_OP -3.793**
(1.401)
POBLACION 0.000
(0.000)
Num.Obs. 196
R2 0.260
R2 Adj. 0.245
AIC 1448.8
BIC 1468.5
Log.Lik. -718.417
RMSE 9.45
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001

#Aqui se puede detectar que cuando no es un movimiento regional, aumenta la probabilidad de que haya mas corrupcion

modelo3_st=formula(scale(INCO2022)~scale(CANON)+scale(as.numeric(TIPO_OP))+scale(as.numeric(HOGARES_DEP))+scale(POBLACION))

modelo3_st=lm(modelo3_st,data=REALDATA)

modelo3_st=list('Corrupcion (III_st)'=modelo3_st)
modelsummary(modelo3_st, title = "Regresion: modelo 3 con \ncoeficientes estandarizados",
             stars = TRUE,
             output = "kableExtra")
Regresion: modelo 3 con coeficientes estandarizados
 Corrupcion (III_st)
(Intercept) 0.000
(0.062)
scale(CANON) 0.230***
(0.063)
scale(as.numeric(TIPO_OP)) -0.170**
(0.063)
scale(as.numeric(HOGARES_DEP)) 0.477***
(0.093)
scale(POBLACION) -0.105
(0.093)
Num.Obs. 196
R2 0.260
R2 Adj. 0.245
AIC 508.2
BIC 527.8
Log.Lik. -248.089
F 16.786
RMSE 0.86
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
models=list('apropiacion (I)'=reg1,
            'apropiacion (II)'=reg2,
            'apropiacion (III)'=reg3)
modelsummary(models, title = "Resultados de todos los modelos de corrupcion",
             stars = TRUE,
             output = "kableExtra")
Resultados de todos los modelos de corrupcion
 apropiacion (I)  apropiacion (II)  apropiacion (III)
(Intercept) 38.985*** 35.966*** 38.254***
(0.796) (0.959) (1.267)
CANON 0.000*** 0.000*** 0.000***
(0.000) (0.000) (0.000)
POBLACION 0.000*** 0.000 0.000
(0.000) (0.000) (0.000)
HOGARES_DEP 0.004*** 0.004***
(0.001) (0.001)
TIPO_OP -3.793**
(1.401)
Num.Obs. 196 196 196
R2 0.130 0.232 0.260
R2 Adj. 0.121 0.220 0.245
AIC 1476.6 1454.2 1448.8
BIC 1489.7 1470.6 1468.5
Log.Lik. -734.312 -722.110 -718.417
F 14.396 19.300
RMSE 10.25 9.63 9.45
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
models=list('apropiacion (I)'=reg1,
            'apropiacion (II)'=reg2,
            'apropiacion (III)'=reg3)
modelsummary(models, title = "Resultados de todos los modelos",statistic = "conf.int",
             stars = TRUE,
             output = "kableExtra")
Resultados de todos los modelos
 apropiacion (I)  apropiacion (II)  apropiacion (III)
(Intercept) 38.985*** 35.966*** 38.254***
[37.415, 40.555] [34.073, 37.858] [35.755, 40.752]
CANON 0.000*** 0.000*** 0.000***
[0.000, 0.000] [0.000, 0.000] [0.000, 0.000]
POBLACION 0.000*** 0.000 0.000
[0.000, 0.000] [0.000, 0.000] [0.000, 0.000]
HOGARES_DEP 0.004*** 0.004***
[0.003, 0.006] [0.003, 0.006]
TIPO_OP -3.793**
[-6.556, -1.031]
Num.Obs. 196 196 196
R2 0.130 0.232 0.260
R2 Adj. 0.121 0.220 0.245
AIC 1476.6 1454.2 1448.8
BIC 1489.7 1470.6 1468.5
Log.Lik. -734.312 -722.110 -718.417
F 14.396 19.300
RMSE 10.25 9.63 9.45
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
library(ggplot2)
library(sjPlot)
plot_models(reg1,reg2,reg3,vline.color = "pink",m.labels=c("Modelo 1","Modelo 2","Modelo 3"),dot.size = 1,line.size = 0.6)

#Despues de elaborar este trascendental y euforico grafico, se procede a realizar la comparacion de modelos anidados a traves de anova test

library(magrittr)
library(knitr)
tanova=anova(reg1,reg2,reg3)

kable(tanova,
      caption = "Tabla ANOVA para comparar modelos")%>%kableExtra::kable_styling(full_width = FALSE)
Tabla ANOVA para comparar modelos
Res.Df RSS Df Sum of Sq F Pr(>F)
193 20603.56 NA NA NA NA
192 18191.43 1 2412.1311 26.298593 0.0000007
191 17518.70 1 672.7306 7.334538 0.0073784

#Como respuesta se puede decir, que el modelo elegido es el 3 uwu

#Ahora procederemos a realizar la prueba de regresion lineal teniendo en cuenta los 5 principios :)

1 PRINCIPIO DE LINEALIDAD

plot(reg3, 1)

2 HOMOCEDASTICIDAD

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(kableExtra)
resBP=bptest(reg3)
data.frame(list('BP'=resBP$statistic,
             'df'=resBP$parameter,
             "p-value"=resBP$p.value))%>%
    kable(caption = resBP$method)%>%kable_styling(full_width = F)
studentized Breusch-Pagan test
BP df p.value
BP 8.913159 4 0.0633071
plot(reg3, 3)

#ESTA BIEN PORQUE ES MAYOR A 0.05

3 Normalidad de Residuos

plot(reg3, 2)

resSW=shapiro.test(reg3$residuals)
data.frame(list('SW'=resSW$statistic,
             "p-value"=resSW$p.value))%>%
    kable(caption = resSW$method)%>%kable_styling(full_width = F)
Shapiro-Wilk normality test
SW p.value
W 0.9845988 0.0306256

#No hay normalidad de residuos

4 NO Multicolinealidad

library(DescTools)
## 
## Attaching package: 'DescTools'
## The following objects are masked from 'package:modelsummary':
## 
##     Format, Mean, Median, N, SD, Var
VIF(reg3) %>%kable(col.names = "VIF",caption ="Evaluando Multicolinealidad usando VIF (Variance Inflation Factors)" )%>%kable_styling(full_width = F)
Evaluando Multicolinealidad usando VIF (Variance Inflation Factors)
VIF
CANON 1.016448
HOGARES_DEP 2.248863
TIPO_OP 1.016514
POBLACION 2.250593

#esta bien porque es menor a 3

5 VALORES INFLUYENTES

plot(reg3, 5)

checkReg3=as.data.frame(influence.measures(reg3)$is.inf)
checkReg3[checkReg3$cook.d & checkReg3$hat,c('cook.d','hat')]%>%kable(caption = "Valores Influyentes criticos")%>%kable_styling(full_width = F)
Valores Influyentes criticos
cook.d hat
105 TRUE TRUE
113 TRUE TRUE

#Debido a que el dato 105 y 113 son grandes, La Convencion y Lima, se va a eliminar estos datos del analisis de la regresion

data_sin_ti= REALDATA[!REALDATA$PROVINCIA=='LIMA',]
data_sin_ti= data_sin_ti [!data_sin_ti$PROVINCIA=='LACONVENCION',]

#Ahora procederemos ha rehacer el modelo 3, esta vez con los datos de Lima y La Convencion

reg4=lm(modelo3,data=data_sin_ti)
summary(reg4)
## 
## Call:
## lm(formula = modelo3, data = data_sin_ti)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2427  -5.4484   0.1942   4.5821  24.7621 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.697e+01  1.141e+00  32.401  < 2e-16 ***
## CANON        2.311e-08  7.085e-09   3.262  0.00131 ** 
## HOGARES_DEP  1.928e-03  8.052e-04   2.394  0.01763 *  
## TIPO_OP     -3.531e+00  1.241e+00  -2.844  0.00494 ** 
## POBLACION    2.404e-05  4.272e-06   5.627 6.55e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.484 on 189 degrees of freedom
## Multiple R-squared:  0.4165, Adjusted R-squared:  0.4042 
## F-statistic: 33.73 on 4 and 189 DF,  p-value: < 2.2e-16

#todos los valores son significativos. Pero debe recalcarse que HOGARES_DEP tiene mejor influencia de 0.01

#Ahora estandarizamos el modelo 4

modelo4_st=formula(scale(INCO2022)~scale(CANON)+scale(as.numeric(TIPO_OP))+scale(as.numeric(HOGARES_DEP))+scale(POBLACION))

modelo4_st=lm(modelo4_st,data=data_sin_ti)

modelo4_st=list('Corrupcion (IV_st)'=modelo4_st)
modelsummary(modelo4_st, title = "Regresion: modelo 4 con \ncoeficientes estandarizados",
             stars = TRUE,
             output = "kableExtra")
Regresion: modelo 4 con coeficientes estandarizados
 Corrupcion (IV_st)
(Intercept) 0.000
(0.055)
scale(CANON) 0.192**
(0.059)
scale(as.numeric(TIPO_OP)) -0.159**
(0.056)
scale(as.numeric(HOGARES_DEP)) 0.170*
(0.071)
scale(POBLACION) 0.417***
(0.074)
Num.Obs. 194
R2 0.417
R2 Adj. 0.404
AIC 457.0
BIC 476.6
Log.Lik. -222.517
F 33.727
RMSE 0.76
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001

1 PRINCIPIO DE LINEALIDAD

plot(reg4, 1)

2 HOMOCEDASTICIDAD

resBP=bptest(reg4)
data.frame(list('BP'=resBP$statistic,
             'df'=resBP$parameter,
             "p-value"=resBP$p.value))%>%
    kable(caption = resBP$method)%>%kable_styling(full_width = F)
studentized Breusch-Pagan test
BP df p.value
BP 17.07223 4 0.0018715

#Es menor a 0.05, es heterocedastico

3 Normalidad de Residuos

plot(reg4, 2)

resSW=shapiro.test(reg4$residuals)
data.frame(list('SW'=resSW$statistic,
             "p-value"=resSW$p.value))%>%
    kable(caption = resSW$method)%>%kable_styling(full_width = F)
Shapiro-Wilk normality test
SW p.value
W 0.9917033 0.3339147

#si es mayor a 0.05

4 NO Multicolinealidad

VIF(reg4) %>%kable(col.names = "VIF",caption ="Evaluando Multicolinealidad usando VIF (Variance Inflation Factors)" )%>%kable_styling(full_width = F)
Evaluando Multicolinealidad usando VIF (Variance Inflation Factors)
VIF
CANON 1.125481
HOGARES_DEP 1.627534
TIPO_OP 1.006356
POBLACION 1.779173

#Estas okey, porque es menor a 3

5 VALORES INFLUYENTES

plot(reg4, 5)

checkReg4=as.data.frame(influence.measures(reg4)$is.inf)
checkReg4[checkReg3$cook.d & checkReg4$hat,c('cook.d','hat')]%>%kable(caption = "Valores Influyentes criticos")%>%kable_styling(full_width = F)
## Warning in checkReg3$cook.d & checkReg4$hat: longitud de objeto mayor no es
## múltiplo de la longitud de uno menor
Valores Influyentes criticos
cook.d hat
NA NA
:—— :—

#UNA VEZ QUE YA REALIZAMOS ESTA REGRESION LINEAL, SE PUEDE PROCEDER A REALIZAR UN ANALISIS MAS COMPLEJO COMO EL DE LA REGRESION LOGISTICA BINARIA PARA PODER VER LA PROBABILIDAD DE QUE UNA PROVINCIA SEA MAS CORRUPTA QUE OTRA

usemos la función “dfSummary” de paquete “summarytools”

library(summarytools)
library(kableExtra)

dfSummary(REALDATA,
          plain.ascii  = FALSE,
          varnumbers = FALSE,
          style        = "grid", 
          graph.col=F,
          na.col    = FALSE) %>%
    kable(caption = "Descriptivos Univariados")%>%
    kable_styling(full_width = F)
Descriptivos Univariados
Variable Stats / Values Freqs (% of Valid) Valid
PROVINCIA
[character]
|1. ABANCAY
2. ACOBAMBA
3. ACOMAYO
4. AIJA
5. ALTOAMAZONAS
6. AMBO
7. ANDAHUAYLAS
8. ANGARAES
9. ANTA
10. ANTABAMBA
[ 186 others ]
|  1 ( 0.5%)
  1 ( 0.5%)
  1 ( 0.5%)
  1 ( 0.5%)
  1 ( 0.5%)
  1 ( 0.5%)
  1 ( 0.5%)
  1 ( 0.5%)
  1 ( 0.5%)
  1 (
0.5%)
\186 (
DEPARTAMENTO
[character]
|1. ÁNCASH
2. CAJAMARCA
3. CUSCO
4. PUNO
5. LALIBERTAD
6. AYACUCHO
7. HUÁNUCO
8. LIMA
9. SANMARTÍN
10. JUNÍN
[ 15 others ]
|\20 (10.2%)
\13 ( 6.6%)
\13 ( 6.6%)
\13 ( 6.6%)
\12 ( 6.1%)
\11 ( 5.6%)
\11 ( 5.6%)
\10 ( 5.1%)
\10 ( 5.1%)
 9 ( 4.6%)
\7
(37.8%)
CANON
[numeric]
|Mean (sd) : 48733024 (144218544)
min < med < max:
7 < 12069096 < 1607175203
IQR (CV) : 34007795 (3)
|195 distinct values |196
(100
POBLACION
[numeric]
|Mean (sd) : 170391.3 (731061.3)
min < med < max:
3270 < 61863 < 10004141
IQR (CV) : 92694.8 (4.3)
|196 distinct values |196
(100
NOMBRE_OP
[character]
|1. PARTIDODEMOCRATICOSOMOSPE
2. ALIANZAPARAELPROGRESO
3. AVANZAPAIS-PARTIDODEINTEG
4. MOVIMIENTOREGIONALAGUA
5. REFORMAYHONRADEZPORMASOBR
6. ALIANZAGOBIERNOUNIDADYACC
7. TRABAJOMASTRABAJO
8. MOVIMIENTOREGIONALSIERRAY
9. ALIANZAPORNUESTRODESARROL
10. JUNTOSPORELPERU
[ 5 ] | 28 (14.3%)
 17 ( 8.7%)
 10 ( 5.1%)
  7 ( 3.6%)
  7 ( 3.6%)
  6 ( 3.1%)
  6 ( 3.1%)
  5 ( 2.6%)
  4 ( 2.0%)
  4 (
2.0%)
\102 (
TIPO_OP
[numeric]
|Min : 0
Mean : 0.6
Max : 1
|0 : 81 (41.3%)
1 : 115 (58.7%)
|196
(100
INCO2022
[numeric]
|Mean (sd) : 40.6 (11)
min < med < max:
17 < 39.9 < 69.1
IQR (CV) : 13.3 (0.3)
|154 distinct values |196
(100
CANT_TRABAJADOR
[numeric]
|Mean (sd) : 334.8 (610)
min < med < max:
5 < 151 < 6524
IQR (CV) : 303.2 (1.8)
|165 distinct values |196
(100
HOGARES_DEP
[numeric]
|Mean (sd) : 948.7 (1260.8)
min < med < max:
9 < 609.5 < 12286
IQR (CV) : 894.8 (1.3)
|184 distinct values |196
(100
library(BBmisc)
## 
## Attaching package: 'BBmisc'
## The following object is masked from 'package:DescTools':
## 
##     %nin%
## The following objects are masked from 'package:dplyr':
## 
##     coalesce, collapse, symdiff
## The following object is masked from 'package:base':
## 
##     isFALSE
boxplot(normalize(REALDATA[,c(3,4,7,8,9)],method='range',range=c(0,10)))

boxplot(normalize(REALDATA[,c(3,4,7,8,9)],method='standardize'))

REALDATA[,c(3,4,7,8,9)]=normalize(REALDATA[,c(3,4,7,8,9)],method='standardize')
cor(REALDATA[,c(3,4,7,8,9)])
##                     CANON POBLACION  INCO2022 CANT_TRABAJADOR HOGARES_DEP
## CANON           1.0000000 0.0389317 0.2683652       0.2214829   0.0483665
## POBLACION       0.0389317 1.0000000 0.2506731       0.8450474   0.7449397
## INCO2022        0.2683652 0.2506731 1.0000000       0.4776644   0.4045967
## CANT_TRABAJADOR 0.2214829 0.8450474 0.4776644       1.0000000   0.7879363
## HOGARES_DEP     0.0483665 0.7449397 0.4045967       0.7879363   1.0000000
dataClus=REALDATA[,c(3,4,7,8,9)]
row.names(dataClus)=REALDATA$PROVINCIA
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(dataClus, pam,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F)

library(cluster)
g.dist = daisy(dataClus, metric="gower")
library(kableExtra)
library(factoextra)
set.seed(123)
res.pam=pam(g.dist,3,cluster.only = F)

#nueva columna
dataClus$pam=res.pam$cluster

# ver

head(dataClus,15)%>%kbl()%>%kable_styling()
CANON POBLACION INCO2022 CANT_TRABAJADOR HOGARES_DEP pam
ABANCAY -0.1498904 -0.0668758 1.4822061 0.3101250 -0.2718360 1
ACOBAMBA -0.3073918 -0.1863213 -0.0333371 -0.2570812 -0.0957619 2
ACOMAYO -0.0914509 -0.2006115 -1.6124061 -0.3701946 -0.4637726 3
AIJA -0.2640939 -0.2246341 -0.4417170 -0.5341270 -0.7104349 3
ALTOAMAZONAS -0.2931565 -0.0207306 0.5474698 -0.2997036 0.5664987 2
AMBO -0.3379103 -0.1614684 0.3205921 -0.1915082 -0.0037593 2
ANDAHUAYLAS -0.0666370 -0.0255250 0.5293196 0.0576691 0.7909534 2
ANGARAES -0.2791050 -0.1620758 -1.4036786 -0.0915094 0.3729758 3
ANTA 0.2659053 -0.1453973 -0.4961677 0.5281552 -0.2353522 2
ANTABAMBA -0.2803191 -0.2174405 -2.0026358 -0.4636360 -0.6366742 3
ANTONIORAYMONDI -0.2399866 -0.2147389 -0.0787127 -0.5357663 -0.5121533 2
AREQUIPA 4.1575107 1.4444928 1.7181589 1.9330561 0.8266441 1
ASCOPE -0.2332925 -0.0621662 0.6019205 -0.4095383 -0.1607983 2
ASUNCION -0.3014880 -0.2228737 -1.9209598 -0.4931439 -0.6319154 3
ATALAYA -0.3003224 -0.1457433 0.4657939 -0.0456083 -0.0378637 2
fviz_silhouette(res.pam,print.summary = F)

silPAM=data.frame(res.pam$silinfo$widths)
silPAM$country=row.names(silPAM)
poorPAM=silPAM[silPAM$sil_width<0,'country']%>%sort()
poorPAM
##  [1] "BARRANCA"            "CAJATAMBO"           "CELENDIN"           
##  [4] "CHACHAPOYAS"         "COTABAMBAS"          "DATEMDELMARANON"    
##  [7] "FERRENAFE"           "GENERALSANCHEZCERRO" "ILO"                
## [10] "NASCA"               "OCROS"               "OXAPAMPA"           
## [13] "PASCO"               "POMABAMBA"           "SANMARTIN"          
## [16] "SATIPO"              "SIHUAS"              "TAMBOPATA"          
## [19] "TOCACHE"             "VIRU"
aggregate(.~ pam, data=dataClus,mean)
##   pam      CANON  POBLACION    INCO2022 CANT_TRABAJADOR HOGARES_DEP
## 1   1  0.6558249  0.5643609  1.47313096       1.0620151  0.73816216
## 2   2 -0.1197961 -0.1230763  0.04380129      -0.2042046 -0.03559762
## 3   3 -0.2657684 -0.1943390 -1.13492800      -0.4128170 -0.46975583
REALDATA$pamINCOpoor=REALDATA$PROVINCIA%in%poorPAM
REALDATA$pamINCO=as.ordered(dataClus$pam)
dataClus$pam=NULL
fviz_nbclust(dataClus, hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "agnes") 

set.seed(123)
library(factoextra)

res.agnes<- hcut(g.dist, k = 3,hc_func='agnes',hc_method = "ward.D")

dataClus$agnes=res.agnes$cluster

# ver

head(dataClus,15)%>%kbl()%>%kable_styling()
CANON POBLACION INCO2022 CANT_TRABAJADOR HOGARES_DEP agnes
ABANCAY -0.1498904 -0.0668758 1.4822061 0.3101250 -0.2718360 1
ACOBAMBA -0.3073918 -0.1863213 -0.0333371 -0.2570812 -0.0957619 2
ACOMAYO -0.0914509 -0.2006115 -1.6124061 -0.3701946 -0.4637726 2
AIJA -0.2640939 -0.2246341 -0.4417170 -0.5341270 -0.7104349 2
ALTOAMAZONAS -0.2931565 -0.0207306 0.5474698 -0.2997036 0.5664987 1
AMBO -0.3379103 -0.1614684 0.3205921 -0.1915082 -0.0037593 1
ANDAHUAYLAS -0.0666370 -0.0255250 0.5293196 0.0576691 0.7909534 1
ANGARAES -0.2791050 -0.1620758 -1.4036786 -0.0915094 0.3729758 2
ANTA 0.2659053 -0.1453973 -0.4961677 0.5281552 -0.2353522 2
ANTABAMBA -0.2803191 -0.2174405 -2.0026358 -0.4636360 -0.6366742 2
ANTONIORAYMONDI -0.2399866 -0.2147389 -0.0787127 -0.5357663 -0.5121533 2
AREQUIPA 4.1575107 1.4444928 1.7181589 1.9330561 0.8266441 1
ASCOPE -0.2332925 -0.0621662 0.6019205 -0.4095383 -0.1607983 1
ASUNCION -0.3014880 -0.2228737 -1.9209598 -0.4931439 -0.6319154 2
ATALAYA -0.3003224 -0.1457433 0.4657939 -0.0456083 -0.0378637 1
fviz_silhouette(res.agnes,print.summary = F)

silAGNES=data.frame(res.agnes$silinfo$widths)
silAGNES$country=row.names(silAGNES)
poorAGNES=silAGNES[silAGNES$sil_width<0,'country']%>%sort()
poorAGNES
##  [1] "AMBO"                 "ASCOPE"               "ATALAYA"             
##  [4] "CAMANA"               "CASMA"                "CHANCHAMAYO"         
##  [7] "CHEPEN"               "CHINCHA"              "CHUCUITO"            
## [10] "CONTRALMIRANTEVILLAR" "ELCOLLAO"             "HUARMEY"             
## [13] "ISLAY"                "JUNIN"                "PAUCARTAMBO"         
## [16] "SANIGNACIO"           "TALARA"               "UCAYALI"
aggregate(.~ agnes, data=dataClus,mean)
##   agnes      CANON   POBLACION   INCO2022 CANT_TRABAJADOR HOGARES_DEP
## 1     1  0.3481577  0.09024428  0.9943282       0.4146046   0.3119474
## 2     2 -0.2158448 -0.16849713 -0.6334287      -0.3436785  -0.2698994
## 3     3 -0.2104557 13.45133477  1.4368305      10.1460709   8.9918808
REALDATA$agnesINCOpoor=REALDATA$PROVINCIA%in%poorAGNES
REALDATA$agnesINCO=as.ordered(dataClus$agnes)
dataClus$agnes=NULL
table(REALDATA$pamINCO,REALDATA$agnesINCO,dnn = c('Particion','Aglomeracion'))
##          Aglomeracion
## Particion  1  2  3
##         1 40  0  1
##         2 35 63  0
##         3  0 57  0
fviz_nbclust(dataClus, hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "diana")

set.seed(123)
res.diana <- hcut(g.dist, k = 3,hc_func='diana')
dataClus$diana=res.diana$cluster
# veamos
head(dataClus,15)%>%kbl%>%kable_styling()
CANON POBLACION INCO2022 CANT_TRABAJADOR HOGARES_DEP diana
ABANCAY -0.1498904 -0.0668758 1.4822061 0.3101250 -0.2718360 1
ACOBAMBA -0.3073918 -0.1863213 -0.0333371 -0.2570812 -0.0957619 2
ACOMAYO -0.0914509 -0.2006115 -1.6124061 -0.3701946 -0.4637726 2
AIJA -0.2640939 -0.2246341 -0.4417170 -0.5341270 -0.7104349 2
ALTOAMAZONAS -0.2931565 -0.0207306 0.5474698 -0.2997036 0.5664987 2
AMBO -0.3379103 -0.1614684 0.3205921 -0.1915082 -0.0037593 2
ANDAHUAYLAS -0.0666370 -0.0255250 0.5293196 0.0576691 0.7909534 2
ANGARAES -0.2791050 -0.1620758 -1.4036786 -0.0915094 0.3729758 2
ANTA 0.2659053 -0.1453973 -0.4961677 0.5281552 -0.2353522 2
ANTABAMBA -0.2803191 -0.2174405 -2.0026358 -0.4636360 -0.6366742 2
ANTONIORAYMONDI -0.2399866 -0.2147389 -0.0787127 -0.5357663 -0.5121533 2
AREQUIPA 4.1575107 1.4444928 1.7181589 1.9330561 0.8266441 1
ASCOPE -0.2332925 -0.0621662 0.6019205 -0.4095383 -0.1607983 2
ASUNCION -0.3014880 -0.2228737 -1.9209598 -0.4931439 -0.6319154 2
ATALAYA -0.3003224 -0.1457433 0.4657939 -0.0456083 -0.0378637 2
fviz_silhouette(res.diana,print.summary = F)

silDIANA=data.frame(res.diana$silinfo$widths)
silDIANA$country=row.names(silDIANA)
poorDIANA=silDIANA[silDIANA$sil_width<0,'country']%>%sort()
poorDIANA
## character(0)
aggregate(.~ diana, data=dataClus,mean)
##   diana      CANON  POBLACION   INCO2022 CANT_TRABAJADOR HOGARES_DEP
## 1     1  0.7575334  0.2876823  1.5433982       0.9628571   0.6335971
## 2     2 -0.1643951 -0.1470013 -0.3465986      -0.2740379  -0.1947986
## 3     3 -0.2104557 13.4513348  1.4368305      10.1460709   8.9918808