library(readxl)
# Read the workbook (path is relative to the working directory).
data <- read_excel("dataOK_all.xlsx")

## New names:
## • `` -> `...1`

# Open the data viewer only in interactive sessions: View() needs a GUI
# viewer and serves no purpose when the script is sourced or knitted.
if (interactive()) {
  View(data)
}

Pregunta 1:

# Print the structure of the loaded data: one line per column with its
# type and first values.
str(data)

## tibble [196 × 50] (S3: tbl_df/tbl/data.frame)
##  $ ...1                   : num [1:196] 1 2 3 4 5 6 7 8 9 10 ...
##  $ key                    : chr [1:196] "AMAZONAS+BAGUA" "AMAZONAS+BONGARA" "AMAZONAS+CHACHAPOYAS" "AMAZONAS+CONDORCANQUI" ...
##  $ Código                 : num [1:196] 102 103 101 104 105 106 107 202 203 204 ...
##  $ pared1_Ladrillo        : num [1:196] 4633 1602 3782 291 430 ...
##  $ pared2_Piedra          : num [1:196] 46 9 22 7 7 7 35 1 0 3 ...
##  $ pared3_Adobe           : num [1:196] 6639 2729 5881 672 5217 ...
##  $ pared4_Tapia           : num [1:196] 222 240 2476 8 6052 ...
##  $ pared5_Quincha         : num [1:196] 2518 157 309 386 346 ...
##  $ pared6_Piedra          : num [1:196] 127 36 168 7 54 28 518 65 7 6 ...
##  $ pared7_Madera          : num [1:196] 4484 2505 1270 8145 606 ...
##  $ pared8_Triplay         : num [1:196] 851 30 91 200 45 24 210 18 0 1 ...
##  $ pared9_Otro            : num [1:196] 0 0 0 0 0 0 0 0 0 0 ...
##  $ pared10_Total          : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ techo1_Concreto        : num [1:196] 2187 692 2262 56 187 ...
##  $ techo2_Madera          : num [1:196] 294 75 160 188 43 48 340 57 12 8 ...
##  $ techo3_Tejas           : num [1:196] 179 382 3393 177 3071 ...
##  $ techo4_Planchas        : num [1:196] 13186 6084 8005 2036 9343 ...
##  $ techo5_Caña            : num [1:196] 160 38 50 15 26 15 196 10 8 5 ...
##  $ techo6_Triplay         : num [1:196] 106 5 14 10 12 5 62 17 4 3 ...
##  $ techo7_Paja            : num [1:196] 3408 32 115 7234 75 ...
##  $ techo8_Otro            : num [1:196] 0 0 0 0 0 0 0 0 0 0 ...
##  $ techo9_Total           : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ piso1_Parquet          : num [1:196] 6 5 23 2 4 3 20 0 0 5 ...
##  $ piso2_Láminas          : num [1:196] 19 2 36 0 0 4 32 0 0 1 ...
##  $ piso3_Losetas          : num [1:196] 647 165 1077 20 46 ...
##  $ piso4_Madera           : num [1:196] 157 132 240 1523 295 ...
##  $ piso5_Cemento          : num [1:196] 7121 2917 6189 943 1911 ...
##  $ piso6_Tierra           : num [1:196] 11569 4087 6434 7228 10501 ...
##  $ piso7_Otro             : num [1:196] 1 0 0 0 0 0 0 0 0 0 ...
##  $ piso8_Total            : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ agua1_Red              : num [1:196] 9429 4569 10647 1307 7172 ...
##  $ agua2_Red_fueraVivienda: num [1:196] 4392 1497 1619 867 3097 ...
##  $ agua3_Pilón            : num [1:196] 793 215 184 1003 1112 ...
##  $ agua4_Camión           : num [1:196] 59 0 49 2 0 0 117 0 0 0 ...
##  $ agua5_Pozo             : num [1:196] 1792 474 876 2564 819 ...
##  $ agua6_Manantial        : num [1:196] 270 67 92 431 132 211 471 121 61 27 ...
##  $ agua7_Río              : num [1:196] 2648 388 488 3428 369 ...
##  $ agua8_Otro             : num [1:196] 56 61 24 80 9 29 104 2 1 6 ...
##  $ agua9_Vecino           : num [1:196] 81 37 20 34 47 8 177 9 4 6 ...
##  $ agua10_Total           : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ elec1_Sí               : num [1:196] 13204 6025 12248 1792 10886 ...
##  $ elec2_No               : num [1:196] 6316 1283 1751 7924 1871 ...
##  $ elec3_Total            : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ departamento           : chr [1:196] "AMAZONAS" "AMAZONAS" "AMAZONAS" "AMAZONAS" ...
##  $ provincia              : chr [1:196] "BAGUA" "BONGARA" "CHACHAPOYAS" "CONDORCANQUI" ...
##  $ Castillo               : num [1:196] 25629 8374 15671 13154 12606 ...
##  $ Keiko                  : num [1:196] 10770 5209 10473 1446 7840 ...
##  $ ganaCastillo           : num [1:196] 1 1 1 1 1 1 1 1 1 1 ...
##  $ countPositivos         : num [1:196] 8126 389 2174 3481 456 ...
##  $ countFallecidos        : num [1:196] 462 72 281 111 88 60 336 26 31 21 ...

# Keep only the four "Total" columns (roof, wall, floor and water).
total_cols <- c("techo9_Total", "pared10_Total", "piso8_Total", "agua10_Total")
df_subset <- data[, total_cols]

# We will use:
library(magrittr)
# Show the first 10 rows as a paged table.
head(df_subset, n = 10) %>%
  rmarkdown::paged_table()

# Drop every row that has a missing value in any selected column.
# NOTE(review): `datos_limpios` is not referenced again in the visible part
# of this file; the str() call below inspects the unfiltered `df_subset`.
rows_complete <- complete.cases(df_subset)
datos_limpios <- df_subset[rows_complete, ]

str(df_subset)

## tibble [196 × 4] (S3: tbl_df/tbl/data.frame)
##  $ techo9_Total : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ pared10_Total: num [1:196] 19520 7308 13999 9716 12757 ...
##  $ piso8_Total  : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ agua10_Total : num [1:196] 19520 7308 13999 9716 12757 ...

# Install polycor only when it is missing, and do so *before* attaching it:
# install.packages() on an already attached package is skipped with the
# warning "package 'polycor' is in use and will not be installed".
if (!requireNamespace("polycor", quietly = TRUE)) {
  install.packages("polycor", dependencies = TRUE)
}
library(polycor)

## Warning: package 'polycor' was built under R version 4.3.3

# Report the installed polycor version.
packageVersion("polycor")

## [1] '0.8.1'

Pregunta 2

# Columns needed for question 2: province name, votes for each candidate,
# COVID counts and households with electricity.
cols_p2 <- c(
  "provincia", "Castillo", "Keiko",
  "countFallecidos", "countPositivos", "elec1_Sí"
)
datos2 <- data[, cols_p2]

# Ratio of Castillo votes to Keiko votes.
datos2[["razon_votacion"]] <- datos2[["Castillo"]] / datos2[["Keiko"]]

# Deaths per 1000 positive cases.
datos2[["tasa_fallecidos"]] <-
  (datos2[["countFallecidos"]] / datos2[["countPositivos"]]) * 1000

# Exclude the row(s) whose province is LIMA.
not_lima <- datos2[["provincia"]] != "LIMA"
datos_sin_lima <- datos2[not_lima, ]

#install.packages("normalize")
library(normalize)

## Warning: package 'normalize' was built under R version 4.3.3

# Standardize the three analysis variables in place. These are columns 6 to 8
# of datos_sin_lima, addressed here by name instead of by position.
vars_std <- c("elec1_Sí", "razon_votacion", "tasa_fallecidos")
datos_sin_lima[, vars_std] <- normalize(
  datos_sin_lima[, vars_std],
  method = "standardize"
)

# Pairwise correlations between the standardized variables.
cor(datos_sin_lima[, vars_std])

##                   elec1_Sí razon_votacion tasa_fallecidos
## elec1_Sí         1.0000000     -0.1891775      -0.1086801
## razon_votacion  -0.1891775      1.0000000       0.1532277
## tasa_fallecidos -0.1086801      0.1532277       1.0000000

Preparación de los datos para la clusterización

# Convert to a plain data.frame (presumably so that row names can be set
# below -- TODO confirm).
datos_sin_lima <- as.data.frame(datos_sin_lima)

# Clustering input: the three standardized variables (columns 6 to 8),
# with each row labelled by its province.
vars_clus <- c("elec1_Sí", "razon_votacion", "tasa_fallecidos")
dataClus <- datos_sin_lima[, vars_clus]
rownames(dataClus) <- datos_sin_lima[["provincia"]]

library(cluster)
# Pairwise Gower dissimilarities between provinces.
g.dist <- daisy(dataClus, metric = "gower")

Decidir cantidad de clusters:

#install.packages("factoextra")  # Si aún no está instalado
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.3.3

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

## PARA JERARQUICO

# Fix the RNG seed *before* the gap statistic: it is the only step in this
# section that draws random (bootstrap) reference samples, so seeding after
# it leaves the suggested number of clusters irreproducible.
set.seed(123)
fviz_nbclust(
  dataClus, hcut,
  diss = g.dist, method = "gap_stat", k.max = 10,
  verbose = FALSE, hc_func = "agnes"
)

library(factoextra)

# Agglomerative hierarchical clustering (agnes) on the Gower dissimilarities.
# NOTE(review): k = 1 puts every province into a single cluster, so the
# `agnes` column is constant and the per-cluster means are just the overall
# means. Confirm the intended k against the gap-statistic plot above.
res.agnes <- hcut(g.dist, k = 1, hc_func = "agnes", hc_method = "ward.D")

# Store each province's cluster label next to the clustering variables.
dataClus$agnes <- res.agnes$cluster

# ver

#install.packages("kableExtra")
library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.3.3

# Render the first 15 rows of the clustering data as a styled kable table.
kable_styling(kbl(head(dataClus, n = 15)))

	elec1_Sí	razon_votacion	tasa_fallecidos	agnes
BAGUA	-0.2814385	-0.3512991	-1.1415785	1
BONGARA	-0.4701703	-0.5292981	-0.5365194	1
CHACHAPOYAS	-0.3065712	-0.5549533	-0.7999689	1
CONDORCANQUI	-0.5814535	1.1973385	-1.2593821	1
LUYA	-0.3423774	-0.5292276	-0.4992799	1
RODRÍGUEZ DE MENDOZA	-0.4472985	-0.5654212	1.1638051	1
UTCUBAMBA	0.0127666	-0.4616676	-0.9869613	1
AIJA	-0.5883939	-0.5205757	0.1430353	1
ANTONIO RAYMONDI	-0.5473561	0.5793327	1.2988419	1
ASUNCIÓN	-0.5751440	-0.1026242	0.2695732	1
BOLOGNESI	-0.4872585	-0.4560327	0.4619025	1
CARHUAZ	-0.3565211	-0.3958612	-0.0165583	1
CARLOS FERMÍN FITZCARRALD	-0.5392327	-0.0220210	1.4548719	1
CASMA	-0.3226340	-0.7319343	0.3638289	1
CORONGO	-0.5808225	-0.5566324	1.0130963	1

# Draw the dendrogram of the agnes result horizontally, with no title.
fviz_dend(
  res.agnes,
  cex = 0.7,
  horiz = TRUE,
  main = ""
)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Mean of every clustering variable within each agnes cluster.
aggregate(. ~ agnes, data = dataClus, FUN = mean)

##   agnes      elec1_Sí razon_votacion tasa_fallecidos
## 1     1 -5.428517e-17  -1.785264e-17   -3.602276e-17

## Pregunta 3

# Columns needed for question 3: the question-2 columns plus the binary
# outcome `ganaCastillo`.
cols_p3 <- c(
  "provincia", "Castillo", "Keiko",
  "countFallecidos", "countPositivos", "elec1_Sí", "ganaCastillo"
)
datos3 <- data[, cols_p3]

# Deaths per 1000 positive cases, computed from datos3's own columns so this
# step does not depend on the question-2 object `datos2` still existing with
# the same rows in the same order.
datos3$tasa_fallecidos <-
  (datos3$countFallecidos / datos3$countPositivos) * 1000

names(datos3)

## [1] "provincia"       "Castillo"        "Keiko"           "countFallecidos"
## [5] "countPositivos"  "elec1_Sí"        "ganaCastillo"    "tasa_fallecidos"

### seed
set.seed(2019)

# Model formula: whether Castillo wins, explained by households with
# electricity and the death rate.
h1 <- ganaCastillo ~ elec1_Sí + tasa_fallecidos

# Logistic regression (binomial family) fitted on datos3.
rlog1 <- glm(formula = h1, family = binomial, data = datos3)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Named list of fitted models; the name is the column header in the summary table.
modelrl <- list("gana castillo(I)" = rlog1)

#f <- function(x) format(x, digits = 4, scientific = FALSE)
library(modelsummary)

## Warning: package 'modelsummary' was built under R version 4.3.3

## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
## 
## Change the default backend persistently:
## 
##   config_modelsummary(factory_default = 'gt')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)

# Regression table with significance stars, drawn with the kableExtra backend.
modelsummary(
  models = modelrl,
  output = "kableExtra",
  stars = TRUE,
  title = "Regresión Logística"
)

Regresión Logística
	gana castillo(I)
(Intercept)	1.931***
	(0.367)
elec1_Sí	0.000***
	(0.000)
tasa_fallecidos	0.000
	(0.001)
Num.Obs.	196
AIC	191.9
BIC	201.8
Log.Lik.	-92.973
F	5.690
RMSE	0.39
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001

# Same model formula as the one already defined before the logistic
# regression above; re-assigned here without change.
h1 <- ganaCastillo ~ elec1_Sí + tasa_fallecidos

ksdlj

2024-06-19

Pregunta 2