Trabajo final

# Cargar librerías
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Importar los datos
data <- read.csv("DATA PROVINCIAS_CHATA - Data.csv", stringsAsFactors = FALSE)

# Limpiar y transformar variables numéricas
data <- data %>%
  mutate(across(c(PORC_GAN, POR_POB_ELEC, SERV_GAS, SERV_AGUA, EDUC_PBL, CP_MED, IND_CORR),
                ~ as.numeric(gsub(",", ".", .)))) # Reemplazar comas y convertir a numérico

# Verificar la estructura de los datos
str(data)

## 'data.frame':    196 obs. of  10 variables:
##  $ PROVINCIA   : chr  "Chachapoyas " "Bagua" "Bongará " "Condorcanqui " ...
##  $ PORC_GAN    : num  59.9 91.1 61.6 90.7 61.7 ...
##  $ POR_POB_ELEC: num  64.1 73.4 77.9 74.5 73.7 ...
##  $ MCR         : chr  "MACROREGION ORIENTE" "MACROREGION ORIENTE" "MACROREGION ORIENTE" "MACROREGION ORIENTE" ...
##  $ SERV_GAS    : num  65.3 45.9 56.1 15.4 45.3 ...
##  $ SERV_AGUA   : num  87.6 70.8 83 22.4 80.5 ...
##  $ EDUC_PBL    : num  46.8 78.1 61.5 113.3 85.5 ...
##  $ CP_MED      : num  13.8 11 13.8 15 16.1 ...
##  $ POBR_MON    : int  4 3 3 1 2 3 3 3 2 3 ...
##  $ IND_CORR    : num  62.7 36.3 27.7 30.2 33.3 21.8 57.6 33.5 41.2 21.4 ...

data <- data %>%
  mutate(across(c(PORC_GAN, POR_POB_ELEC, SERV_GAS, SERV_AGUA, EDUC_PBL, CP_MED, IND_CORR),
                ~ as.numeric(gsub(",", ".", .)))) %>%  # Reemplazar comas y convertir a numérico
  mutate(MCR = as.factor(MCR))  # Asegurar que la macroregión es categórica

colnames(data)

##  [1] "PROVINCIA"    "PORC_GAN"     "POR_POB_ELEC" "MCR"          "SERV_GAS"    
##  [6] "SERV_AGUA"    "EDUC_PBL"     "CP_MED"       "POBR_MON"     "IND_CORR"

colnames(data)[colnames(data) == "PORC_GAN "] <- "PORC_GAN"

modelo1 <- lm(PORC_GAN ~ SERV_GAS + SERV_AGUA, data = data)
summary(modelo1)

## 
## Call:
## lm(formula = PORC_GAN ~ SERV_GAS + SERV_AGUA, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.763 -11.725  -1.039  13.352  34.037 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 78.19608    4.60506  16.980  < 2e-16 ***
## SERV_GAS    -0.42323    0.05674  -7.459 2.89e-12 ***
## SERV_AGUA    0.13388    0.06837   1.958   0.0516 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.32 on 193 degrees of freedom
## Multiple R-squared:  0.2267, Adjusted R-squared:  0.2187 
## F-statistic: 28.29 on 2 and 193 DF,  p-value: 1.684e-11

reg1=lm(modelo1,data=data)
summary(reg1)

## 
## Call:
## lm(formula = modelo1, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.763 -11.725  -1.039  13.352  34.037 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 78.19608    4.60506  16.980  < 2e-16 ***
## SERV_GAS    -0.42323    0.05674  -7.459 2.89e-12 ***
## SERV_AGUA    0.13388    0.06837   1.958   0.0516 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.32 on 193 degrees of freedom
## Multiple R-squared:  0.2267, Adjusted R-squared:  0.2187 
## F-statistic: 28.29 on 2 and 193 DF,  p-value: 1.684e-11

library(modelsummary)

## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)

model1=list('apropiacion (I)'=reg1)
modelsummary(model1, title = "Regresion: modelo 1",
             stars = TRUE,
             output = "kableExtra")

Regresion: modelo 1
	apropiacion (I)
(Intercept)	78.196***
	(4.605)
SERV_GAS	−0.423***
	(0.057)
SERV_AGUA	0.134+
	(0.068)
Num.Obs.	196
R2	0.227
R2 Adj.	0.219
AIC	1655.9
BIC	1669.0
Log.Lik.	−823.943
F	28.288
RMSE	16.20
+ p

modelo2 <- lm(PORC_GAN ~ SERV_GAS + SERV_AGUA + EDUC_PBL , data = data)
summary(modelo2)

## 
## Call:
## lm(formula = PORC_GAN ~ SERV_GAS + SERV_AGUA + EDUC_PBL, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.126 -12.240  -1.196  13.276  33.331 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 85.96018    8.56227  10.039  < 2e-16 ***
## SERV_GAS    -0.48314    0.07950  -6.077 6.46e-09 ***
## SERV_AGUA    0.11094    0.07159   1.550    0.123    
## EDUC_PBL    -0.04844    0.04504  -1.075    0.284    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.32 on 192 degrees of freedom
## Multiple R-squared:  0.2313, Adjusted R-squared:  0.2193 
## F-statistic: 19.26 on 3 and 192 DF,  p-value: 5.839e-11

reg2=lm(modelo2,data=data)
summary(reg2)

## 
## Call:
## lm(formula = modelo2, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.126 -12.240  -1.196  13.276  33.331 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 85.96018    8.56227  10.039  < 2e-16 ***
## SERV_GAS    -0.48314    0.07950  -6.077 6.46e-09 ***
## SERV_AGUA    0.11094    0.07159   1.550    0.123    
## EDUC_PBL    -0.04844    0.04504  -1.075    0.284    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.32 on 192 degrees of freedom
## Multiple R-squared:  0.2313, Adjusted R-squared:  0.2193 
## F-statistic: 19.26 on 3 and 192 DF,  p-value: 5.839e-11

model2=list('apropiacion (II)'=reg2)
modelsummary(model2, title = "Regresion: modelo 2",
             stars = TRUE,
             output = "kableExtra")

Regresion: modelo 2
	apropiacion (II)
(Intercept)	85.960***
	(8.562)
SERV_GAS	−0.483***
	(0.080)
SERV_AGUA	0.111
	(0.072)
EDUC_PBL	−0.048
	(0.045)
Num.Obs.	196
R2	0.231
R2 Adj.	0.219
AIC	1656.7
BIC	1673.1
Log.Lik.	−823.354
F	19.259
RMSE	16.15
+ p

modelo3 <- lm(PORC_GAN ~ SERV_GAS + SERV_AGUA + EDUC_PBL + CP_MED , data = data)
summary(modelo3)

## 
## Call:
## lm(formula = PORC_GAN ~ SERV_GAS + SERV_AGUA + EDUC_PBL + CP_MED, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.914 -11.981  -0.503  13.350  33.144 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 87.58302    9.05060   9.677  < 2e-16 ***
## SERV_GAS    -0.50122    0.08590  -5.835 2.27e-08 ***
## SERV_AGUA    0.10209    0.07343   1.390    0.166    
## EDUC_PBL    -0.07525    0.06566  -1.146    0.253    
## CP_MED       0.19592    0.34863   0.562    0.575    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.35 on 191 degrees of freedom
## Multiple R-squared:  0.2326, Adjusted R-squared:  0.2165 
## F-statistic: 14.47 on 4 and 191 DF,  p-value: 2.432e-10

reg3=lm(modelo3,data=data)
summary(reg3)

## 
## Call:
## lm(formula = modelo3, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.914 -11.981  -0.503  13.350  33.144 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 87.58302    9.05060   9.677  < 2e-16 ***
## SERV_GAS    -0.50122    0.08590  -5.835 2.27e-08 ***
## SERV_AGUA    0.10209    0.07343   1.390    0.166    
## EDUC_PBL    -0.07525    0.06566  -1.146    0.253    
## CP_MED       0.19592    0.34863   0.562    0.575    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.35 on 191 degrees of freedom
## Multiple R-squared:  0.2326, Adjusted R-squared:  0.2165 
## F-statistic: 14.47 on 4 and 191 DF,  p-value: 2.432e-10

model3=list('apropiacion (III)'=reg3)
modelsummary(model3, title = "Regresion: modelo 3",
             stars = TRUE,
             output = "kableExtra")

Regresion: modelo 3
	apropiacion (III)
(Intercept)	87.583***
	(9.051)
SERV_GAS	−0.501***
	(0.086)
SERV_AGUA	0.102
	(0.073)
EDUC_PBL	−0.075
	(0.066)
CP_MED	0.196
	(0.349)
Num.Obs.	196
R2	0.233
R2 Adj.	0.217
AIC	1658.4
BIC	1678.1
Log.Lik.	−823.192
F	14.472
RMSE	16.14
+ p

modelo4 <- lm(PORC_GAN ~ SERV_GAS + SERV_AGUA + EDUC_PBL + CP_MED + IND_CORR , data = data)
summary(modelo4)

## 
## Call:
## lm(formula = PORC_GAN ~ SERV_GAS + SERV_AGUA + EDUC_PBL + CP_MED + 
##     IND_CORR, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.899 -11.821  -0.009  13.439  33.966 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 98.43629   11.23794   8.759 1.08e-15 ***
## SERV_GAS    -0.47972    0.08656  -5.542 9.89e-08 ***
## SERV_AGUA    0.08975    0.07351   1.221    0.224    
## EDUC_PBL    -0.09846    0.06694  -1.471    0.143    
## CP_MED       0.17941    0.34731   0.517    0.606    
## IND_CORR    -0.21956    0.13581  -1.617    0.108    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.28 on 190 degrees of freedom
## Multiple R-squared:  0.243,  Adjusted R-squared:  0.2231 
## F-statistic:  12.2 on 5 and 190 DF,  p-value: 2.917e-10

reg4=lm(modelo4,data=data)
summary(reg4)

## 
## Call:
## lm(formula = modelo4, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.899 -11.821  -0.009  13.439  33.966 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 98.43629   11.23794   8.759 1.08e-15 ***
## SERV_GAS    -0.47972    0.08656  -5.542 9.89e-08 ***
## SERV_AGUA    0.08975    0.07351   1.221    0.224    
## EDUC_PBL    -0.09846    0.06694  -1.471    0.143    
## CP_MED       0.17941    0.34731   0.517    0.606    
## IND_CORR    -0.21956    0.13581  -1.617    0.108    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.28 on 190 degrees of freedom
## Multiple R-squared:  0.243,  Adjusted R-squared:  0.2231 
## F-statistic:  12.2 on 5 and 190 DF,  p-value: 2.917e-10

model4=list('apropiacion (IV)'=reg4)
modelsummary(model4, title = "Regresion: modelo 4",
             stars = TRUE,
             output = "kableExtra")

Regresion: modelo 4
	apropiacion (IV)
(Intercept)	98.436***
	(11.238)
SERV_GAS	−0.480***
	(0.087)
SERV_AGUA	0.090
	(0.074)
EDUC_PBL	−0.098
	(0.067)
CP_MED	0.179
	(0.347)
IND_CORR	−0.220
	(0.136)
Num.Obs.	196
R2	0.243
R2 Adj.	0.223
AIC	1657.7
BIC	1680.7
Log.Lik.	−821.853
F	12.198
RMSE	16.03
+ p

models=list('apropiacion (I)'=reg1,
            'apropiacion (II)'=reg2,
            'apropiacion (III)'=reg3,
            'apropiacion (IV)' = reg4)
modelsummary(models, title = "Resultados de todos los modelos",
             stars = TRUE,
             output = "kableExtra")

Resultados de todos los modelos
	apropiacion (I)	apropiacion (II)	apropiacion (III)	apropiacion (IV)
(Intercept)	78.196***	85.960***	87.583***	98.436***
	(4.605)	(8.562)	(9.051)	(11.238)
SERV_GAS	−0.423***	−0.483***	−0.501***	−0.480***
	(0.057)	(0.080)	(0.086)	(0.087)
SERV_AGUA	0.134+	0.111	0.102	0.090
	(0.068)	(0.072)	(0.073)	(0.074)
EDUC_PBL		−0.048	−0.075	−0.098
		(0.045)	(0.066)	(0.067)
CP_MED			0.196	0.179
			(0.349)	(0.347)
IND_CORR				−0.220
				(0.136)
Num.Obs.	196	196	196	196
R2	0.227	0.231	0.233	0.243
R2 Adj.	0.219	0.219	0.217	0.223
AIC	1655.9	1656.7	1658.4	1657.7
BIC	1669.0	1673.1	1678.1	1680.7
Log.Lik.	−823.943	−823.354	−823.192	−821.853
F	28.288	19.259	14.472	12.198
RMSE	16.20	16.15	16.14	16.03
+ p

models=list('apropiacion (I)'=reg1,
            'apropiacion (II)'=reg2,
            'apropiacion (III)'=reg3,
            'apropiacion (IV)'=reg4)
modelsummary(models, title = "Resultados de todos los modelos",statistic = "conf.int",
             stars = TRUE,
             output = "kableExtra")

Resultados de todos los modelos
	apropiacion (I)	apropiacion (II)	apropiacion (III)	apropiacion (IV)
(Intercept)	78.196***	85.960***	87.583***	98.436***
	[69.113, 87.279]	[69.072, 102.848]	[69.731, 105.435]	[76.269, 120.603]
SERV_GAS	−0.423***	−0.483***	−0.501***	−0.480***
	[−0.535, −0.311]	[−0.640, −0.326]	[−0.671, −0.332]	[−0.650, −0.309]
SERV_AGUA	0.134+	0.111	0.102	0.090
	[−0.001, 0.269]	[−0.030, 0.252]	[−0.043, 0.247]	[−0.055, 0.235]
EDUC_PBL		−0.048	−0.075	−0.098
		[−0.137, 0.040]	[−0.205, 0.054]	[−0.231, 0.034]
CP_MED			0.196	0.179
			[−0.492, 0.884]	[−0.506, 0.864]
IND_CORR				−0.220
				[−0.487, 0.048]
Num.Obs.	196	196	196	196
R2	0.227	0.231	0.233	0.243
R2 Adj.	0.219	0.219	0.217	0.223
AIC	1655.9	1656.7	1658.4	1657.7
BIC	1669.0	1673.1	1678.1	1680.7
Log.Lik.	−823.943	−823.354	−823.192	−821.853
F	28.288	19.259	14.472	12.198
RMSE	16.20	16.15	16.14	16.03
+ p

library(ggplot2)
library(sjPlot)

## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!

plot_models(reg1,reg2,reg3, reg4, vline.color = "black",m.labels=c("Modelo 1","Modelo 2","Modelo 3", "Modelo 4"),dot.size = 1,line.size = 0.6)

modelo5_c <- lm(PORC_GAN ~ SERV_GAS + SERV_AGUA + EDUC_PBL + CP_MED + IND_CORR + POR_POB_ELEC + MCR, data = data)
summary(modelo5_c)

## 
## Call:
## lm(formula = PORC_GAN ~ SERV_GAS + SERV_AGUA + EDUC_PBL + CP_MED + 
##     IND_CORR + POR_POB_ELEC + MCR, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.810  -6.446  -0.136   6.730  31.594 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            69.36970   10.34884   6.703 2.39e-10 ***
## SERV_GAS               -0.62261    0.05878 -10.592  < 2e-16 ***
## SERV_AGUA               0.23087    0.05287   4.366 2.10e-05 ***
## EDUC_PBL               -0.07761    0.04487  -1.730   0.0854 .  
## CP_MED                 -0.02291    0.23678  -0.097   0.9230    
## IND_CORR                0.07253    0.09021   0.804   0.4224    
## POR_POB_ELEC            0.02800    0.07932   0.353   0.7245    
## MCRMACROREGION CENTRO  16.01045    3.61759   4.426 1.64e-05 ***
## MCRMACROREGION NORTE   -1.09847    3.69987  -0.297   0.7669    
## MCRMACROREGION ORIENTE  4.03745    3.94642   1.023   0.3076    
## MCRMACROREGION SUR     32.22452    3.68252   8.751 1.30e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.54 on 185 degrees of freedom
## Multiple R-squared:  0.6907, Adjusted R-squared:  0.674 
## F-statistic: 41.31 on 10 and 185 DF,  p-value: < 2.2e-16

reg5=lm(modelo5_c,data=data)
summary(reg5)

## 
## Call:
## lm(formula = modelo5_c, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.810  -6.446  -0.136   6.730  31.594 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            69.36970   10.34884   6.703 2.39e-10 ***
## SERV_GAS               -0.62261    0.05878 -10.592  < 2e-16 ***
## SERV_AGUA               0.23087    0.05287   4.366 2.10e-05 ***
## EDUC_PBL               -0.07761    0.04487  -1.730   0.0854 .  
## CP_MED                 -0.02291    0.23678  -0.097   0.9230    
## IND_CORR                0.07253    0.09021   0.804   0.4224    
## POR_POB_ELEC            0.02800    0.07932   0.353   0.7245    
## MCRMACROREGION CENTRO  16.01045    3.61759   4.426 1.64e-05 ***
## MCRMACROREGION NORTE   -1.09847    3.69987  -0.297   0.7669    
## MCRMACROREGION ORIENTE  4.03745    3.94642   1.023   0.3076    
## MCRMACROREGION SUR     32.22452    3.68252   8.751 1.30e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.54 on 185 degrees of freedom
## Multiple R-squared:  0.6907, Adjusted R-squared:  0.674 
## F-statistic: 41.31 on 10 and 185 DF,  p-value: < 2.2e-16

model5_c=list('apropiacion (V)'=reg5)
modelsummary(model5_c, title = "Regresion: modelo 5 Control",
             stars = TRUE,
             output = "kableExtra")

Regresion: modelo 5 Control
	apropiacion (V)
(Intercept)	69.370***
	(10.349)
SERV_GAS	−0.623***
	(0.059)
SERV_AGUA	0.231***
	(0.053)
EDUC_PBL	−0.078+
	(0.045)
CP_MED	−0.023
	(0.237)
IND_CORR	0.073
	(0.090)
POR_POB_ELEC	0.028
	(0.079)
MCRMACROREGION CENTRO	16.010***
	(3.618)
MCRMACROREGION NORTE	−1.098
	(3.700)
MCRMACROREGION ORIENTE	4.037
	(3.946)
MCRMACROREGION SUR	32.225***
	(3.683)
Num.Obs.	196
R2	0.691
R2 Adj.	0.674
AIC	1492.3
BIC	1531.6
Log.Lik.	−734.144
F	41.309
RMSE	10.24
+ p

models=list('apropiacion (I)'=reg1,
            'apropiacion (II)'=reg2,
            'apropiacion (III)'=reg3,
            'apropiacion (IV)' = reg4,
            'apropiacion (V)' = reg5)
modelsummary(models, title = "Resultados de todos los modelos",
             stars = TRUE,
             output = "kableExtra")

Resultados de todos los modelos
	apropiacion (I)	apropiacion (II)	apropiacion (III)	apropiacion (IV)	apropiacion (V)
(Intercept)	78.196***	85.960***	87.583***	98.436***	69.370***
	(4.605)	(8.562)	(9.051)	(11.238)	(10.349)
SERV_GAS	−0.423***	−0.483***	−0.501***	−0.480***	−0.623***
	(0.057)	(0.080)	(0.086)	(0.087)	(0.059)
SERV_AGUA	0.134+	0.111	0.102	0.090	0.231***
	(0.068)	(0.072)	(0.073)	(0.074)	(0.053)
EDUC_PBL		−0.048	−0.075	−0.098	−0.078+
		(0.045)	(0.066)	(0.067)	(0.045)
CP_MED			0.196	0.179	−0.023
			(0.349)	(0.347)	(0.237)
IND_CORR				−0.220	0.073
				(0.136)	(0.090)
POR_POB_ELEC					0.028
					(0.079)
MCRMACROREGION CENTRO					16.010***
					(3.618)
MCRMACROREGION NORTE					−1.098
					(3.700)
MCRMACROREGION ORIENTE					4.037
					(3.946)
MCRMACROREGION SUR					32.225***
					(3.683)
Num.Obs.	196	196	196	196	196
R2	0.227	0.231	0.233	0.243	0.691
R2 Adj.	0.219	0.219	0.217	0.223	0.674
AIC	1655.9	1656.7	1658.4	1657.7	1492.3
BIC	1669.0	1673.1	1678.1	1680.7	1531.6
Log.Lik.	−823.943	−823.354	−823.192	−821.853	−734.144
F	28.288	19.259	14.472	12.198	41.309
RMSE	16.20	16.15	16.14	16.03	10.24
+ p

models=list('apropiacion (I)'=reg1,
            'apropiacion (II)'=reg2,
            'apropiacion (III)'=reg3,
            'apropiacion (IV)'=reg4,
            'apropiacion (V)'=reg5)
modelsummary(models, title = "Resultados de todos los modelos",statistic = "conf.int",
             stars = TRUE,
             output = "kableExtra")

Resultados de todos los modelos
	apropiacion (I)	apropiacion (II)	apropiacion (III)	apropiacion (IV)	apropiacion (V)
(Intercept)	78.196***	85.960***	87.583***	98.436***	69.370***
	[69.113, 87.279]	[69.072, 102.848]	[69.731, 105.435]	[76.269, 120.603]	[48.953, 89.787]
SERV_GAS	−0.423***	−0.483***	−0.501***	−0.480***	−0.623***
	[−0.535, −0.311]	[−0.640, −0.326]	[−0.671, −0.332]	[−0.650, −0.309]	[−0.739, −0.507]
SERV_AGUA	0.134+	0.111	0.102	0.090	0.231***
	[−0.001, 0.269]	[−0.030, 0.252]	[−0.043, 0.247]	[−0.055, 0.235]	[0.127, 0.335]
EDUC_PBL		−0.048	−0.075	−0.098	−0.078+
		[−0.137, 0.040]	[−0.205, 0.054]	[−0.231, 0.034]	[−0.166, 0.011]
CP_MED			0.196	0.179	−0.023
			[−0.492, 0.884]	[−0.506, 0.864]	[−0.490, 0.444]
IND_CORR				−0.220	0.073
				[−0.487, 0.048]	[−0.105, 0.250]
POR_POB_ELEC					0.028
					[−0.128, 0.184]
MCRMACROREGION CENTRO					16.010***
					[8.873, 23.147]
MCRMACROREGION NORTE					−1.098
					[−8.398, 6.201]
MCRMACROREGION ORIENTE					4.037
					[−3.748, 11.823]
MCRMACROREGION SUR					32.225***
					[24.959, 39.490]
Num.Obs.	196	196	196	196	196
R2	0.227	0.231	0.233	0.243	0.691
R2 Adj.	0.219	0.219	0.217	0.223	0.674
AIC	1655.9	1656.7	1658.4	1657.7	1492.3
BIC	1669.0	1673.1	1678.1	1680.7	1531.6
Log.Lik.	−823.943	−823.354	−823.192	−821.853	−734.144
F	28.288	19.259	14.472	12.198	41.309
RMSE	16.20	16.15	16.14	16.03	10.24
+ p

library(ggplot2)
library(sjPlot)


plot_models(reg1,reg2,reg3, reg4, reg5, vline.color = "black",m.labels=c("Modelo 1","Modelo 2","Modelo 3", "Modelo 4", "Modelo 5 Control"),dot.size = 1,line.size = 0.6)

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(knitr)
tanova=anova(reg1,reg2,reg3, reg4, reg5)

kable(tanova,
      caption = "Comparación de modelos")%>%kableExtra::kable_styling(full_width = FALSE)

Comparación de modelos
Res.Df	RSS	Df	Sum of Sq	F	Pr(>F)
193	51421.63	NA	NA	NA	NA
192	51113.74	1	307.88484	2.7692677	0.0977830
191	51029.36	1	84.38012	0.7589563	0.3847844
190	50336.86	1	692.50283	6.2287112	0.0134445
185	20568.14	5	29768.71750	53.5508988	0.0000000

#Analisis factorial

names(data)

##  [1] "PROVINCIA"    "PORC_GAN"     "POR_POB_ELEC" "MCR"          "SERV_GAS"    
##  [6] "SERV_AGUA"    "EDUC_PBL"     "CP_MED"       "POBR_MON"     "IND_CORR"

# Cargar librerías
library(magrittr)
library(dplyr)


# Definir columnas a excluir (puedes personalizar según tu archivo)
dontselect <- c("PROVINCIA", "MCR", "POBR_MON")  # Excluye columnas categóricas o irrelevantes

# Seleccionar columnas numéricas
select <- setdiff(names(data), dontselect)
theData <- data[, select]

# Limpiar las columnas numéricas (si hay comas en lugar de puntos)
theData <- theData %>%
  mutate(across(everything(), ~ as.numeric(gsub(",", ".", .))))

# Mostrar las primeras filas como tabla
library(rmarkdown)
head(theData, 10) %>%
  rmarkdown::paged_table()

library(polycor)

corMatrix=polycor::hetcor(theData)$correlations

round(corMatrix,2)

##              PORC_GAN POR_POB_ELEC SERV_GAS SERV_AGUA EDUC_PBL CP_MED IND_CORR
## PORC_GAN         1.00         0.17    -0.46     -0.06     0.28   0.13    -0.29
## POR_POB_ELEC     0.17         1.00    -0.27     -0.10     0.36   0.21    -0.24
## SERV_GAS        -0.46        -0.27     1.00      0.38    -0.75  -0.35     0.50
## SERV_AGUA       -0.06        -0.10     0.38      1.00    -0.47  -0.19     0.18
## EDUC_PBL         0.28         0.36    -0.75     -0.47     1.00   0.71    -0.55
## CP_MED           0.13         0.21    -0.35     -0.19     0.71   1.00    -0.39
## IND_CORR        -0.29        -0.24     0.50      0.18    -0.55  -0.39     1.00

library(ggcorrplot)

ggcorrplot(corMatrix)

library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:polycor':
## 
##     polyserial

## The following object is masked from 'package:modelsummary':
## 
##     SD

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

psych::KMO(corMatrix)

## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA =  0.67
## MSA for each item = 
##     PORC_GAN POR_POB_ELEC     SERV_GAS    SERV_AGUA     EDUC_PBL       CP_MED 
##         0.70         0.81         0.66         0.70         0.61         0.56 
##     IND_CORR 
##         0.92

cortest.bartlett(corMatrix,n=nrow(theData))$p.value>0.05

## [1] FALSE

library(matrixcalc)

is.singular.matrix(corMatrix)

## [1] FALSE

fa.parallel(theData, fa = 'fa',correct = T,plot = F)

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA

##SI BIEN ES CIERTO AQUI ME SALE 2 FACTORES, cada vez que lo corro en otro archivo, salen 3 por lo que no supe muy bien como cambiarlo en mi dashboard

library(GPArotation)

## 
## Attaching package: 'GPArotation'

## The following objects are masked from 'package:psych':
## 
##     equamax, varimin

resfa <- fa(theData,
            nfactors = 2,
            cor = 'mixed',
            rotate = "varimax", #oblimin?
            fm="minres")

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

print(resfa$loadings)

## 
## Loadings:
##              MR1    MR2   
## PORC_GAN            -0.536
## POR_POB_ELEC  0.272 -0.228
## SERV_GAS     -0.452  0.793
## SERV_AGUA    -0.353  0.199
## EDUC_PBL      0.939 -0.391
## CP_MED        0.694       
## IND_CORR     -0.439  0.400
## 
##                  MR1   MR2
## SS loadings    1.966 1.329
## Proportion Var 0.281 0.190
## Cumulative Var 0.281 0.471

print(resfa$loadings,cutoff = 0.5)

## 
## Loadings:
##              MR1    MR2   
## PORC_GAN            -0.536
## POR_POB_ELEC              
## SERV_GAS             0.793
## SERV_AGUA                 
## EDUC_PBL      0.939       
## CP_MED        0.694       
## IND_CORR                  
## 
##                  MR1   MR2
## SS loadings    1.966 1.329
## Proportion Var 0.281 0.190
## Cumulative Var 0.281 0.471

fa.diagram(resfa,main = "Resultados")

sort(resfa$communality)

## POR_POB_ELEC    SERV_AGUA     PORC_GAN     IND_CORR       CP_MED     SERV_GAS 
##    0.1258413    0.1638421    0.2952804    0.3527498    0.4893836    0.8339456 
##     EDUC_PBL 
##    1.0345268

sort(resfa$complexity)

##       CP_MED     PORC_GAN     EDUC_PBL    SERV_AGUA     SERV_GAS POR_POB_ELEC 
##     1.032948     1.052794     1.337492     1.575692     1.588159     1.939394 
##     IND_CORR 
##     1.983142

#TUCKER LEWIS

resfa$TLI

## [1] 0.8967672

no es mayor

#RMS CERCA A CERO

resfa$rms

## [1] 0.03902024

#RMSEA cerca a cero

resfa$RMSEA

##      RMSEA      lower      upper confidence 
## 0.11343509 0.07008286 0.16082036 0.90000000

#BIC

resfa$BIC

## [1] -14.00762

#indices

as.data.frame(resfa$scores)%>%head()

##          MR1        MR2
## 1 -0.2368571  0.9245780
## 2  0.2398385 -0.1773225
## 3 -0.2017179  0.2070784
## 4  0.2661158 -1.2595331
## 5  0.4263647  0.0973378
## 6 -0.3153642 -0.4514099

library(psych)
library(ggplot2)

# Seleccionar las variables relevantes
variables <- data[, c("SERV_GAS", "SERV_AGUA", "IND_CORR", "POBR_MON")]

# Escalar las variables
variables_scaled <- scale(variables)

# Realizar el análisis factorial exploratorio (EFA) con 2 factores
resfa <- fa(variables_scaled, nfactors = 2, rotate = "varimax", fm = "ml")

# Asignar puntajes factoriales al dataset original
data$Factor1 <- resfa$scores[,1]  # Puntajes del primer factor
data$Factor2 <- resfa$scores[,2]  # Puntajes del segundo factor

# Generar el gráfico comparando una variable original con el primer puntaje factorial
ggplot(data = data, aes(x = SERV_GAS, y = Factor1)) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "Servicio de Gas (original)",
    y = "Puntaje Factorial 1 "
  )

# normalizando
library(BBmisc)

## 
## Attaching package: 'BBmisc'

## The following objects are masked from 'package:dplyr':
## 
##     coalesce, collapse, symdiff

## The following object is masked from 'package:base':
## 
##     isFALSE

#Analisis de conglomerados

# Seleccionar las variables relevantes
variables <- data[, c("SERV_GAS", "SERV_AGUA", "IND_CORR", "POBR_MON" )]

# Estandarizar las variables (si las escalas son diferentes)
variables_scaled <- scale(variables)

# Visualizar la distribución de las variables con boxplots
boxplot(variables_scaled, horizontal = FALSE, las = 2, cex.axis = 0.7)

library(BBmisc)

# Cargar librería
library(scales)

## 
## Attaching package: 'scales'

## The following objects are masked from 'package:psych':
## 
##     alpha, rescale

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# Seleccionar y normalizar variables relevantes
variables <- data[, c("SERV_GAS", "SERV_AGUA", "IND_CORR", "POBR_MON")]

# Normalizar variables al rango 0-10
variables_normalized <- as.data.frame(lapply(variables, rescale, to = c(0, 10)))

# Visualizar la distribución con boxplot
boxplot(variables_normalized, horizontal = FALSE, las = 2, cex.axis = 0.7,
        main = "Distribución de Variables Normalizadas (Rango 0-10)")

# Normalizar o estandarizar las variables (opcional, dependiendo de los rangos)
variables_scaled <- scale(variables)

# Calcular la matriz de correlación
cor_matrix <- cor(variables_scaled, use = "pairwise.complete.obs")

# Mostrar la matriz de correlación
print(cor_matrix)

##            SERV_GAS SERV_AGUA  IND_CORR  POBR_MON
## SERV_GAS  1.0000000 0.3827218 0.4957952 0.8070535
## SERV_AGUA 0.3827218 1.0000000 0.1827262 0.3168633
## IND_CORR  0.4957952 0.1827262 1.0000000 0.3739093
## POBR_MON  0.8070535 0.3168633 0.3739093 1.0000000

# Seleccionar variables relevantes
dataClus <- data[, c("SERV_GAS", "SERV_AGUA", "IND_CORR", "POBR_MON")]

# Escalar las variables
dataClus <- scale(dataClus)

# Establecer nombres de fila (opcional, si tienes identificadores únicos como provincias)
row.names(dataClus) <- data$PROVINCIA

# Calcular matriz de distancias
dist_matrix <- dist(dataClus, method = "euclidean")

# Librerías necesarias
library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Escalar las variables seleccionadas
variables <- scale(data[, c("SERV_GAS", "SERV_AGUA", "IND_CORR", "POBR_MON")])

# Calcular el número óptimo de clusters con Gap Statistic
fviz_nbclust(
  variables,        # Variables escaladas
  FUN = hcut,       # Función de clusterización jerárquica
  method = "gap_stat",  # Método de evaluación
  k.max = 10,       # Máximo número de clusters evaluados
  verbose = FALSE   # Desactiva mensajes en consola
)

library(factoextra)
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Escalar las variables seleccionadas
dataClus <- scale(data[, c("SERV_GAS", "SERV_AGUA", "IND_CORR", "POBR_MON")])

# Realizar la clusterización jerárquica aglomerativa (AGNES) con linkage ward.D
set.seed(123)
res.agnes <- hcut(
  dataClus,                # Datos escalados
  k = 3,                   # Número de clusters deseados
  hc_func = "agnes",       # Método aglomerativo
  hc_method = "ward.D"     # Linkage ward
)

# Agregar los clusters al dataset original
data$agnes_cluster <- res.agnes$cluster

# Mostrar las primeras filas con los resultados
head(data, 15) %>% 
  kbl() %>% 
  kable_styling(full_width = FALSE)

PROVINCIA	PORC_GAN	POR_POB_ELEC	MCR	SERV_GAS	SERV_AGUA	EDUC_PBL	CP_MED	POBR_MON	IND_CORR	Factor1	Factor2	agnes_cluster
Chachapoyas	59.94	64.10	MACROREGION ORIENTE	65.28	87.62	46.84	13.77	4	62.7	0.4330506	0.7543148	1
Bagua	91.14	73.39	MACROREGION ORIENTE	45.91	70.80	78.07	10.98	3	36.3	0.0208357	-0.1343857	2
Bongará	61.65	77.93	MACROREGION ORIENTE	56.13	83.00	61.50	13.79	3	27.7	0.4429481	0.0579086	2
Condorcanqui	90.65	74.54	MACROREGION ORIENTE	15.36	22.38	113.35	15.00	1	30.2	-1.2329723	-0.7684571	3
Luya	61.66	73.69	MACROREGION ORIENTE	45.28	80.50	85.52	16.10	2	33.3	-0.1435424	0.0129080	2
Rodríguez de Mendoza	59.20	68.08	MACROREGION ORIENTE	38.98	73.38	69.54	12.48	3	21.8	0.0282620	-0.6428235	2
Utcubamba	65.53	72.27	MACROREGION ORIENTE	48.57	77.71	62.87	9.72	3	57.6	-0.1835367	0.3241129	1
Aija	62.20	93.35	MACROREGION NORTE	22.76	76.92	149.23	13.99	3	33.5	-0.5790006	-1.0234500	2
Antonio Raymondi	86.52	78.80	MACROREGION NORTE	18.93	91.15	98.92	8.60	2	41.2	-0.9712663	-0.8101144	2
Asunción	77.57	77.92	MACROREGION NORTE	31.48	90.62	97.28	3.89	3	21.4	-0.1488779	-0.9492707	2
Bolognesi	65.82	78.67	MACROREGION NORTE	31.27	79.33	68.30	10.00	4	40.3	-0.2267558	-0.8433548	1
Carhuaz	68.62	77.41	MACROREGION NORTE	25.27	86.42	40.39	5.20	3	41.0	-0.5957927	-0.8183556	2
Carlos F. Fitzcarrald	79.20	76.07	MACROREGION NORTE	15.34	85.79	82.72	7.03	2	41.6	-1.0855929	-0.9219609	2
Casma	47.15	65.25	MACROREGION NORTE	61.48	64.82	23.40	2.45	4	52.5	0.4294802	0.4792701	1
Corongo	59.82	70.92	MACROREGION NORTE	23.18	79.60	104.78	12.47	2	30.0	-0.7182899	-0.8348002	2

fviz_silhouette(res.agnes,print.summary = F)

# Escalar las variables seleccionadas
dataClus <- scale(data[, c("SERV_GAS", "SERV_AGUA", "IND_CORR", "POBR_MON")])

# Realizar la clusterización jerárquica aglomerativa (AGNES)
set.seed(123)
res.agnes <- hcut(
  dataClus,                # Datos escalados
  k = 3,                   # Número de clusters deseados
  hc_func = "agnes",       # Método aglomerativo
  hc_method = "ward.D"     # Linkage ward
)

# Visualizar el dendrograma en formato horizontal
fviz_dend(
  res.agnes,               # Objeto del resultado de hcut
  cex = 0.7,               # Tamaño del texto en el dendrograma
  horiz = TRUE,            # Dendrograma horizontal
  main = "Dendrograma de Clusterización Jerárquica (AGNES)"
)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Crear un data frame con la información de las siluetas
silAGNES <- data.frame(res.agnes$silinfo$widths)

# Agregar las etiquetas de las filas (observaciones, por ejemplo, provincias)
silAGNES$observation <- row.names(silAGNES)

# Identificar las observaciones con un ancho de silueta negativo
poorAGNES <- silAGNES[silAGNES$sil_width < 0, 'observation'] %>% sort()

# Mostrar las observaciones con valores de silueta negativos
poorAGNES

##  [1] "11"  "137" "153" "156" "157" "170" "21"  "24"  "46"  "48"  "61"  "71" 
## [13] "79"  "89"

#ANALISIS UNIVARIADO

table(data$MCR)

## 
##       LIMA Y CALLAO  MACROREGION CENTRO   MACROREGION NORTE MACROREGION ORIENTE 
##                  11                  53                  59                  29 
##     MACROREGION SUR 
##                  44

median(data$PORC_GAN,na.rm=T)

## [1] 68.025

median(data$POR_POB_ELEC,na.rm=T)

## [1] 79.135

median(data$SERV_GAS,na.rm=T)

## [1] 45.08

median(data$SERV_AGUA,na.rm=T)

## [1] 74.17

median(data$EDUC_PBL,na.rm=T)

## [1] 68.005

median(data$CP_MED,na.rm=T)

## [1] 7.05

median(data$IND_CORR,na.rm=T)

## [1] 41.15

mean(data$PORC_GAN,na.rm=T)

## [1] 67.4046

data %>%                      ## Paso 1: DATA 
  summarize(Promedio=mean(PORC_GAN),Mediana=median(PORC_GAN))  ## Paso 2: Resumir

##   Promedio Mediana
## 1  67.4046  68.025

data %>%                      ## Paso 1: DATA 
  group_by(MCR) %>%    ## Paso 2: group_by
  summarize(Promedio=mean(PORC_GAN),Mediana=median(PORC_GAN)) ## Paso 2:summarize

## # A tibble: 5 × 3
##   MCR                 Promedio Mediana
##   <fct>                  <dbl>   <dbl>
## 1 LIMA Y CALLAO           46.5    45.3
## 2 MACROREGION CENTRO      73.1    79.9
## 3 MACROREGION NORTE       58.7    59.0
## 4 MACROREGION ORIENTE     57.3    59.2
## 5 MACROREGION SUR         84.0    87.1

HISTOGRAMA

library(ggplot2)

ggplot(data, aes(x=PORC_GAN)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data, aes(x = MCR)) +
  geom_bar(fill = 'red')

sd(data$PORC_GAN)

## [1] 18.46623

var(data$PORC_GAN)

## [1] 341.0015

max(data$PORC_GAN)

## [1] 99.03

min(data$PORC_GAN)

## [1] 25.93

range(data$PORC_GAN)

## [1] 25.93 99.03

quantile(data$PORC_GAN)

##      0%     25%     50%     75%    100% 
## 25.9300 52.9650 68.0250 84.0975 99.0300

IQR(data$PORC_GAN)

## [1] 31.1325

ggplot(data, aes(y = PORC_GAN )) + 
  stat_boxplot(geom = "errorbar", # Error bars
               width = 0.25) +    # Bars width
  geom_boxplot()

library(ggplot2)
ggplot(data, aes(x=MCR,y = PORC_GAN)) + 
  geom_boxplot()

library(ggplot2)
ggplot(data, aes(x=MCR,y = PORC_GAN,fill=MCR)) + 
  geom_boxplot()

ggplot(data.frame(data), aes(x = PORC_GAN)) +
       geom_histogram(aes(y = ..density..),
                      color = "gray", fill = "white") +
       geom_density(fill = "black", alpha = 0.2)

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data, aes(x = PORC_GAN, fill = MCR , colour = MCR)) + 
  geom_histogram(alpha = 0.5, position = "identity") + 
  theme(legend.position = "left") # Izquierda

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(dplyr)
data %>%
  ggplot(aes(x = PORC_GAN, group = MCR)) +
  geom_histogram() +
  facet_wrap(~ MCR) +
  labs(x = "Macroregiones", y = "porcentaje en que gano PEDRO CASTILLO")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Trabajo final

Paula Chata

2024-11-15