Ricardo Asenjo ## FINAL TEST
library(rio)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(polycor)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:polycor':
##
## polyserial
library(matrixcalc)
library(GPArotation)
##
## Attaching package: 'GPArotation'
## The following objects are masked from 'package:psych':
##
## equamax, varimin
library(BBmisc)
##
## Attaching package: 'BBmisc'
## The following objects are masked from 'package:dplyr':
##
## coalesce, collapse, symdiff
## The following object is masked from 'package:base':
##
## isFALSE
# Read the report workbook.
data1 <- rio::import("reporte_com.xlsx")

# Percentage that `part` represents of the two-way total `part + rest`.
pct_of_pair <- function(part, rest) {
  (part / (part + rest)) * 100
}

# For each category (ELE = electricity, GLP = LPG, plus CAR and LE) turn the
# COM_SI_* / COM_NO_* pair into "yes" and "no" percentages, then keep only the
# province label and the derived percentages.
# NOTE(review): COM_SI_* / COM_NO_* are presumably yes/no counts -- confirm.
datapor <- data1 %>%
  mutate(
    PORC_SI_ELE = pct_of_pair(COM_SI_ELE, COM_NO_ELE),
    PORC_NO_ELE = pct_of_pair(COM_NO_ELE, COM_SI_ELE),
    PORC_SI_GLP = pct_of_pair(COM_SI_GLP, COM_NO_GLP),
    PORC_NO_GLP = pct_of_pair(COM_NO_GLP, COM_SI_GLP),
    PORC_SI_CAR = pct_of_pair(COM_SI_CAR, COM_NO_CAR),
    PORC_NO_CAR = pct_of_pair(COM_NO_CAR, COM_SI_CAR),
    PORC_SI_LE = pct_of_pair(COM_SI_LE, COM_NO_LE),
    PORC_NO_LE = pct_of_pair(COM_NO_LE, COM_SI_LE)
  ) %>%
  select(
    PROVINCIA,
    PORC_SI_ELE, PORC_NO_ELE,
    PORC_SI_GLP, PORC_NO_GLP,
    PORC_SI_CAR, PORC_NO_CAR,
    PORC_SI_LE, PORC_NO_LE
  )
Selección de data
# Keep only the "yes" percentage columns: drop the province label and the
# complementary "no" percentages.
dontselect <- c(
  "PROVINCIA",
  "PORC_NO_ELE", "PORC_NO_GLP", "PORC_NO_CAR", "PORC_NO_LE"
)
select <- names(datapor)[!(names(datapor) %in% dontselect)]
theData <- datapor[, select]

# Show the first 10 rows as a paged table.
rmarkdown::paged_table(head(theData, n = 10))
# Correlation matrix of the selected columns (via polycor::hetcor), followed
# by the Kaiser-Meyer-Olkin sampling-adequacy measure computed on it.
het_fit <- polycor::hetcor(theData)
corMatrix <- het_fit$correlations
psych::KMO(corMatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA = 0.63
## MSA for each item =
## PORC_SI_ELE PORC_SI_GLP PORC_SI_CAR PORC_SI_LE
## 0.91 0.59 0.72 0.59
El KMO global (Overall MSA = 0.63) es mayor a 0.6.
# Bartlett's test on the correlation matrix; prints TRUE only when the
# p-value is greater than 0.05.
bartlett_test <- psych::cortest.bartlett(corMatrix, n = nrow(theData))
bartlett_test$p.value > 0.05
## [1] FALSE
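FALSE: el p-valor no es mayor a 0.05, por lo que se rechaza que la matriz de correlación sea una matriz identidad.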
# Check whether the correlation matrix is singular.
matrixcalc::is.singular.matrix(corMatrix)
## [1] FALSE
FALSE: la matriz de correlación no es singular.
# Parallel analysis to suggest the number of factors; plotting is disabled.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
fa.parallel(theData, fa = "fa", correct = TRUE, plot = FALSE)
## Parallel analysis suggests that the number of factors = 1 and the number of components = NA
El análisis paralelo señala que debe retenerse un solo factor.
Varimax
# One-factor solution (minres) requested with a varimax rotation.
# With a single factor the printed loadings are the same as those of the
# oblimin fit shown later in this file.
resfaVAR <- psych::fa(
  theData,
  nfactors = 1,
  cor = "mixed",
  rotate = "varimax",
  fm = "minres"
)
print(resfaVAR[["loadings"]])
##
## Loadings:
## MR1
## PORC_SI_ELE 0.491
## PORC_SI_GLP 0.912
## PORC_SI_CAR 0.280
## PORC_SI_LE -0.956
##
## MR1
## SS loadings 2.065
## Proportion Var 0.516
OBLIMIN
# One-factor solution (minres) requested with an oblimin rotation.
# With a single factor the printed loadings are the same as those of the
# varimax fit shown earlier in this file.
resfaOBL <- psych::fa(
  theData,
  nfactors = 1,
  cor = "mixed",
  rotate = "oblimin",
  fm = "minres"
)
print(resfaOBL[["loadings"]])
##
## Loadings:
## MR1
## PORC_SI_ELE 0.491
## PORC_SI_GLP 0.912
## PORC_SI_CAR 0.280
## PORC_SI_LE -0.956
##
## MR1
## SS loadings 2.065
## Proportion Var 0.516
# Read the second workbook and derive the three indicators used later:
#   PORC_AGU_DEN: AGU_DEN as a percentage of AGU_TOTAL
#   RAZON_VOT:    ratio VOT_FP / VOT_PL
#   TASA:         FAL_CASOS per 1000 COV_CASOS
data2 <- rio::import("datafinal.xlsx")

datapor2 <- data2 %>%
  mutate(
    PORC_AGU_DEN = AGU_DEN / AGU_TOTAL * 100,
    RAZON_VOT = VOT_FP / VOT_PL,
    TASA = FAL_CASOS / COV_CASOS * 1000
  ) %>%
  select(PROVINCIA, PORC_AGU_DEN, RAZON_VOT, TASA)

# Exclude the LIMA row(s) from the analysis.
datapor2 <- datapor2[datapor2[["PROVINCIA"]] != "LIMA", ]
Data lista
# Indicator columns of datapor2 (columns 2 to 4, right after PROVINCIA),
# addressed by name so the code does not depend on column positions.
indicator_cols <- c("PORC_AGU_DEN", "RAZON_VOT", "TASA")

# Boxplots on the original scales.
boxplot(
  datapor2[, indicator_cols],
  horizontal = FALSE, las = 2, cex.axis = 0.5
)

# Standardize once, plot the result, and store it back in datapor2.
standardized <- BBmisc::normalize(
  datapor2[, indicator_cols],
  method = "standardize"
)
boxplot(standardized)
datapor2[, indicator_cols] <- standardized

# Correlations between the standardized indicators.
cor(datapor2[, indicator_cols])
## PORC_AGU_DEN RAZON_VOT TASA
## PORC_AGU_DEN 1.0000000 -0.23767178 0.15643461
## RAZON_VOT -0.2376718 1.00000000 -0.07524669
## TASA 0.1564346 -0.07524669 1.00000000
# Clustering input: indicator columns 2 to 4 of datapor2, with the province
# name as row label.
dataClus <- datapor2[, 2:4]
rownames(dataClus) <- datapor2$PROVINCIA

# Pairwise Gower dissimilarities between provinces.
library(cluster)
g.dist <- daisy(dataClus, metric = "gower")
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# The gap statistic is estimated from random reference samples, so the RNG is
# seeded before computing it; otherwise the suggested number of clusters can
# change from run to run.
set.seed(123)
fviz_nbclust(
  dataClus, pam,
  diss = g.dist, method = "gap_stat", k.max = 10, verbose = FALSE
)

# Number of clusters requested from the clustering calls.
K <- 3

# Seed kept from the original script.
set.seed(123)
res.pam <- pam(g.dist, k = K, cluster.only = FALSE)

# Store each province's PAM cluster as a new column.
dataClus$pam <- res.pam$cluster
head(dataClus)
## PORC_AGU_DEN RAZON_VOT TASA pam
## ABANCAY 0.6900947 -0.3540687 -0.5660703 1
## ACOBAMBA 0.2989593 0.6763994 -0.3088953 2
## ACOMAYO 0.7647068 3.0681095 0.6704760 1
## AIJA 1.0330394 -0.4533493 -0.6927374 1
## ALTO AMAZONAS -0.4256137 -0.4220545 0.1347549 2
## AMBO -0.5512555 -0.2620915 -0.1916298 2
Jerárquica Aglomerativa
set.seed(123)
library(factoextra)

# Agglomerative hierarchical clustering (agnes, Ward linkage) on the Gower
# dissimilarities, cut into K clusters.
res.agnes <- hcut(g.dist, k = K, hc_func = "agnes", hc_method = "ward.D")

# Store each province's cluster as a new column and draw the silhouette plot.
# FALSE is spelled out because F is an ordinary, reassignable variable.
dataClus$agnes <- res.agnes$cluster
fviz_silhouette(res.agnes, print.summary = FALSE)
Técnica jerárquica divisiva
set.seed(123)

# Divisive hierarchical clustering (diana) on the Gower dissimilarities,
# cut into K clusters.
res.diana <- hcut(g.dist, k = K, hc_func = "diana")

# Store each province's cluster as a new column and draw the silhouette plot.
# FALSE is spelled out because F is an ordinary, reassignable variable.
dataClus$diana <- res.diana$cluster
fviz_silhouette(res.diana, print.summary = FALSE)
La técnica divisiva clusteriza erróneamente menos provincias que la técnica aglomerativa.
Técnica de partición:
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
set.seed(123)

# PAM (partitioning) on the Gower dissimilarities. K (defined earlier in the
# file as 3) replaces the hard-coded 3 so all three techniques use the same
# number of clusters.
res.pam <- pam(g.dist, k = K, cluster.only = FALSE)

# Store each province's cluster as a new column and draw the silhouette plot.
dataClus$pam <- res.pam$cluster
fviz_silhouette(res.pam, print.summary = FALSE)
La técnica de partición clusteriza menos erróneamente, pero tiene un ancho de silueta promedio menor.