Ricardo Asenjo ## FINAL TEST

library(rio)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
library(polycor)
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:polycor':
## 
##     polyserial
library(matrixcalc)
library(GPArotation)
## 
## Attaching package: 'GPArotation'
## The following objects are masked from 'package:psych':
## 
##     equamax, varimin
library(BBmisc)
## 
## Attaching package: 'BBmisc'
## The following objects are masked from 'package:dplyr':
## 
##     coalesce, collapse, symdiff
## The following object is masked from 'package:base':
## 
##     isFALSE
# Read the raw workbook with the per-province COM_SI_* / COM_NO_* counts.
data1 <- import("reporte_com.xlsx")

# For each indicator pair (ELE = electricity, GLP = LPG, CAR, LE) express the
# "SI" and "NO" counts as a percentage of their sum, then keep only PROVINCIA
# and the eight percentage columns.
# NOTE(review): CAR / LE presumably stand for charcoal / firewood — confirm.
datapor <- data1 %>%
  mutate(
    # Denominators, computed once per pair; dropped again by select() below.
    TOTAL_ELE = COM_SI_ELE + COM_NO_ELE,
    TOTAL_GLP = COM_SI_GLP + COM_NO_GLP,
    TOTAL_CAR = COM_SI_CAR + COM_NO_CAR,
    TOTAL_LE = COM_SI_LE + COM_NO_LE,

    PORC_SI_ELE = (COM_SI_ELE / TOTAL_ELE) * 100,
    PORC_NO_ELE = (COM_NO_ELE / TOTAL_ELE) * 100,

    PORC_SI_GLP = (COM_SI_GLP / TOTAL_GLP) * 100,
    PORC_NO_GLP = (COM_NO_GLP / TOTAL_GLP) * 100,

    PORC_SI_CAR = (COM_SI_CAR / TOTAL_CAR) * 100,
    PORC_NO_CAR = (COM_NO_CAR / TOTAL_CAR) * 100,

    PORC_SI_LE = (COM_SI_LE / TOTAL_LE) * 100,
    PORC_NO_LE = (COM_NO_LE / TOTAL_LE) * 100
  ) %>%
  select(
    PROVINCIA,
    PORC_SI_ELE, PORC_NO_ELE,
    PORC_SI_GLP, PORC_NO_GLP,
    PORC_SI_CAR, PORC_NO_CAR,
    PORC_SI_LE, PORC_NO_LE
  )

Análisis factorial

Selección de data

# Columns left out of the factor analysis: the province label and the "NO"
# percentages (each one is 100 minus its "SI" counterpart by construction).
dontselect <- c(
  "PROVINCIA",
  "PORC_NO_ELE", "PORC_NO_GLP", "PORC_NO_CAR", "PORC_NO_LE"
)
select <- setdiff(names(datapor), dontselect)
theData <- datapor[select]

# Preview the first 10 rows as a paged table.
rmarkdown::paged_table(head(theData, 10))

# Correlation matrix from polycor::hetcor(), followed by the
# Kaiser-Meyer-Olkin sampling-adequacy measure computed on it.
corMatrix <- polycor::hetcor(theData)$correlations
psych::KMO(corMatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA =  0.63
## MSA for each item = 
## PORC_SI_ELE PORC_SI_GLP PORC_SI_CAR  PORC_SI_LE 
##        0.91        0.59        0.72        0.59

El KMO global (0.63) es mayor a 0.6, por lo que los datos son adecuados para el análisis factorial.

# Bartlett test on the correlation matrix: prints TRUE when its p-value is
# above 0.05 and FALSE otherwise.
psych::cortest.bartlett(corMatrix, n = nrow(theData))$p.value > 0.05
## [1] FALSE

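FALSE: el p-valor no es mayor a 0.05, por lo que se rechaza que la matriz de correlación sea una matriz identidad.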

# Check whether the correlation matrix is singular (TRUE would be a problem
# for the factor analysis that follows).
matrixcalc::is.singular.matrix(corMatrix)
## [1] FALSE

FALSE: la matriz de correlación no es singular.

# Parallel analysis (factor-analysis eigenvalues only, no plot) to suggest how
# many factors to extract. The procedure compares the observed eigenvalues
# against randomly generated/resampled data, so the seed is fixed to make the
# suggested number of factors reproducible between runs.
# NOTE(review): psych documents `correct` as a numeric continuity correction
# (default 0.5); TRUE is kept as in the original call — confirm it is intended.
set.seed(123)
fa.parallel(theData, fa = "fa", correct = TRUE, plot = FALSE)
## Parallel analysis suggests that the number of factors =  1  and the number of components =  NA

El análisis paralelo sugiere extraer un solo factor.

Varimax

# Single-factor minres solution on the "mixed" correlation matrix, requesting
# a varimax rotation; only the loadings are printed.
# NOTE(review): the loadings recorded in this report are identical for the
# varimax and oblimin runs (one factor) — confirm the rotation matters here.
resfaVAR <- fa(
  r = theData,
  nfactors = 1,
  cor = "mixed",
  rotate = "varimax",
  fm = "minres"
)
print(resfaVAR$loadings)
## 
## Loadings:
##             MR1   
## PORC_SI_ELE  0.491
## PORC_SI_GLP  0.912
## PORC_SI_CAR  0.280
## PORC_SI_LE  -0.956
## 
##                  MR1
## SS loadings    2.065
## Proportion Var 0.516

OBLIMIN

# Same single-factor minres model as the varimax run, this time requesting an
# oblimin rotation; only the loadings are printed.
resfaOBL <- fa(
  r = theData,
  nfactors = 1,
  cor = "mixed",
  rotate = "oblimin",
  fm = "minres"
)
print(resfaOBL$loadings)
## 
## Loadings:
##             MR1   
## PORC_SI_ELE  0.491
## PORC_SI_GLP  0.912
## PORC_SI_CAR  0.280
## PORC_SI_LE  -0.956
## 
##                  MR1
## SS loadings    2.065
## Proportion Var 0.516

Pregunta 2

# Read the second workbook.
data2 <- import("datafinal.xlsx")

# Build the three clustering indicators per province:
#   PORC_AGU_DEN  AGU_DEN as a percentage of AGU_TOTAL
#   RAZON_VOT     ratio VOT_FP / VOT_PL
#   TASA          FAL_CASOS per 1000 COV_CASOS
# NOTE(review): a zero denominator would yield Inf/NaN — confirm none occur.
datapor2 <- data2 %>%
  mutate(
    PORC_AGU_DEN = (AGU_DEN / AGU_TOTAL) * 100,
    RAZON_VOT = (VOT_FP / VOT_PL),
    TASA = ((FAL_CASOS / COV_CASOS) * 1000)
  ) %>%
  select(PROVINCIA, PORC_AGU_DEN, RAZON_VOT, TASA) %>%
  # Exclude LIMA. filter() drops rows whose PROVINCIA is NA, whereas
  # `df[df$PROVINCIA != "LIMA", ]` would turn them into all-NA rows that later
  # break the correlations and the province row names.
  dplyr::filter(PROVINCIA != "LIMA")

Data lista

# Raw indicators (columns 2 to 4): vertical boxplots with perpendicular,
# shrunken axis labels.
boxplot(datapor2[, 2:4], horizontal = FALSE, las = 2, cex.axis = 0.5)

# The same indicators after normalize(method = "standardize").
boxplot(normalize(datapor2[, 2:4], method = "standardize"))

# Overwrite the indicators with their standardized values and inspect the
# pairwise correlations.
datapor2[, 2:4] <- normalize(datapor2[, 2:4], method = "standardize")
cor(datapor2[, 2:4])
##              PORC_AGU_DEN   RAZON_VOT        TASA
## PORC_AGU_DEN    1.0000000 -0.23767178  0.15643461
## RAZON_VOT      -0.2376718  1.00000000 -0.07524669
## TASA            0.1564346 -0.07524669  1.00000000
library(cluster)

# Clustering input: the three standardized indicators, labelled by province.
dataClus <- datapor2[, 2:4]
rownames(dataClus) <- datapor2$PROVINCIA

# Pairwise Gower dissimilarities between provinces.
g.dist <- daisy(dataClus, metric = "gower")
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Gap-statistic plot for PAM, evaluating up to k.max = 10 clusters.
fviz_nbclust(
  dataClus,
  FUNcluster = pam,
  diss = g.dist,
  method = "gap_stat",
  k.max = 10,
  verbose = FALSE
)

# Number of clusters used by every technique in this report.
K <- 3

# PAM partition on the Gower dissimilarities; store each province's cluster
# as a new column and show the first rows.
set.seed(123)
res.pam <- pam(g.dist, k = K, cluster.only = FALSE)
dataClus$pam <- res.pam$cluster
head(dataClus)
##               PORC_AGU_DEN  RAZON_VOT       TASA pam
## ABANCAY          0.6900947 -0.3540687 -0.5660703   1
## ACOBAMBA         0.2989593  0.6763994 -0.3088953   2
## ACOMAYO          0.7647068  3.0681095  0.6704760   1
## AIJA             1.0330394 -0.4533493 -0.6927374   1
## ALTO AMAZONAS   -0.4256137 -0.4220545  0.1347549   2
## AMBO            -0.5512555 -0.2620915 -0.1916298   2

Jerárquica Aglomerativa

# Agglomerative hierarchical clustering (agnes with "ward.D") on the Gower
# dissimilarities, cut into K groups; the silhouette plot is drawn without
# its printed summary.
set.seed(123)
library(factoextra)

res.agnes <- hcut(
  g.dist,
  k = K,
  hc_func = "agnes",
  hc_method = "ward.D"
)
dataClus$agnes <- res.agnes$cluster

fviz_silhouette(res.agnes, print.summary = FALSE)

Técnica jerárquica divisiva

# Divisive hierarchical clustering (diana) on the Gower dissimilarities, cut
# into K groups; the silhouette plot is drawn without its printed summary.
set.seed(123)
res.diana <- hcut(g.dist, k = K, hc_func = "diana")
dataClus$diana <- res.diana$cluster

fviz_silhouette(res.diana, print.summary = FALSE)

La técnica divisiva clasifica erróneamente menos provincias que la técnica aglomerativa.

Técnica de partición:

library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# PAM partition on the Gower dissimilarities, using the same K as the
# hierarchical techniques (previously a hard-coded 3, which would silently
# disagree with them if K were changed).
set.seed(123)
res.pam <- pam(g.dist, k = K, cluster.only = FALSE)

# Store each province's cluster and draw the silhouette plot without its
# printed summary.
dataClus$pam <- res.pam$cluster
fviz_silhouette(res.pam, print.summary = FALSE)

La técnica de partición clasifica erróneamente menos provincias, pero tiene un ancho de silueta promedio menor.