FINALTEST

Ricardo Asenjo ## FINAL TEST

library(rio)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(magrittr)
library(polycor)
library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:polycor':
## 
##     polyserial

library(matrixcalc)
library(GPArotation)

## 
## Attaching package: 'GPArotation'

## The following objects are masked from 'package:psych':
## 
##     equamax, varimin

library(BBmisc)

## 
## Attaching package: 'BBmisc'

## The following objects are masked from 'package:dplyr':
## 
##     coalesce, collapse, symdiff

## The following object is masked from 'package:base':
## 
##     isFALSE

# Read the raw report from the Excel workbook.
data1 <- import("reporte_com.xlsx")

# For every item (ELE, GLP, CAR, LE) express the SI and NO counts as a
# percentage of their SI + NO total. The TOT_* columns are temporary
# denominators; the final select() keeps only the province identifier and
# the eight percentage columns.
datapor <- data1 %>%
  mutate(
    TOT_ELE = COM_SI_ELE + COM_NO_ELE,
    TOT_GLP = COM_SI_GLP + COM_NO_GLP,
    TOT_CAR = COM_SI_CAR + COM_NO_CAR,
    TOT_LE = COM_SI_LE + COM_NO_LE,
    # Electricity: share of "yes" and "no" answers.
    PORC_SI_ELE = (COM_SI_ELE / TOT_ELE) * 100,
    PORC_NO_ELE = (COM_NO_ELE / TOT_ELE) * 100,
    # GLP: share of "yes" and "no" answers.
    PORC_SI_GLP = (COM_SI_GLP / TOT_GLP) * 100,
    PORC_NO_GLP = (COM_NO_GLP / TOT_GLP) * 100,
    PORC_SI_CAR = (COM_SI_CAR / TOT_CAR) * 100,
    PORC_NO_CAR = (COM_NO_CAR / TOT_CAR) * 100,
    PORC_SI_LE = (COM_SI_LE / TOT_LE) * 100,
    PORC_NO_LE = (COM_NO_LE / TOT_LE) * 100
  ) %>%
  select(
    PROVINCIA,
    PORC_SI_ELE, PORC_NO_ELE,
    PORC_SI_GLP, PORC_NO_GLP,
    PORC_SI_CAR, PORC_NO_CAR,
    PORC_SI_LE, PORC_NO_LE
  )

Análisis factorial

Selección de data

# Columns excluded from the factor analysis: the province label and the
# "NO" percentages (each one is the complement of its "SI" counterpart).
dontselect <- c(
  "PROVINCIA",
  "PORC_NO_ELE", "PORC_NO_GLP", "PORC_NO_CAR", "PORC_NO_LE"
)
select <- setdiff(names(datapor), dontselect)
theData <- datapor[, select]

# Preview the first ten rows as a paged table.
theData %>%
  head(10) %>%
  rmarkdown::paged_table()

# Correlation matrix estimated by polycor::hetcor().
corMatrix <- polycor::hetcor(theData)$correlations

# Kaiser-Meyer-Olkin adequacy measure computed on that matrix.
psych::KMO(corMatrix)

## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA =  0.63
## MSA for each item = 
## PORC_SI_ELE PORC_SI_GLP PORC_SI_CAR  PORC_SI_LE 
##        0.91        0.59        0.72        0.59

El KMO global (0.63) es mayor a 0.6, por lo que los datos son adecuados para el análisis factorial.

# Bartlett's test on the correlation matrix, using the number of rows of
# theData as sample size; prints TRUE only when the p-value exceeds 0.05.
psych::cortest.bartlett(corMatrix, n = nrow(theData))$p.value > 0.05

## [1] FALSE

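FALSE: el p-valor de la prueba de Bartlett no es mayor a 0.05, por lo que se rechaza que la matriz de correlación sea una matriz identidad.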

# Check whether the correlation matrix is singular.
matrixcalc::is.singular.matrix(corMatrix)

## [1] FALSE

FALSE: la matriz de correlación no es singular.

# Parallel analysis to suggest how many factors to extract.
# The procedure compares the observed eigenvalues against simulated/resampled
# data, so the seed is fixed to make the suggested number reproducible.
# NOTE(review): `correct` is documented as a numeric continuity correction
# (default 0.5); TRUE is kept here to preserve the original call - confirm
# whether it is intended.
set.seed(123)
fa.parallel(theData, fa = "fa", correct = TRUE, plot = FALSE)

## Parallel analysis suggests that the number of factors =  1  and the number of components =  NA

Señala que debe ser un solo factor.

Varimax

# One-factor minres solution on mixed correlations, requesting a varimax
# rotation, followed by the factor loadings.
# NOTE(review): with a single factor there is nothing to rotate, so the
# rotation choice is not expected to change the loadings - confirm against
# the psych::fa documentation.
resfaVAR <- fa(
  theData,
  nfactors = 1,
  fm = "minres",
  rotate = "varimax",
  cor = "mixed"
)
print(resfaVAR$loadings)

## 
## Loadings:
##             MR1   
## PORC_SI_ELE  0.491
## PORC_SI_GLP  0.912
## PORC_SI_CAR  0.280
## PORC_SI_LE  -0.956
## 
##                  MR1
## SS loadings    2.065
## Proportion Var 0.516

OBLIMIN

# Same one-factor minres solution on mixed correlations, this time
# requesting an oblimin rotation, followed by the factor loadings.
resfaOBL <- fa(
  theData,
  nfactors = 1,
  fm = "minres",
  rotate = "oblimin",
  cor = "mixed"
)
print(resfaOBL$loadings)

## 
## Loadings:
##             MR1   
## PORC_SI_ELE  0.491
## PORC_SI_GLP  0.912
## PORC_SI_CAR  0.280
## PORC_SI_LE  -0.956
## 
##                  MR1
## SS loadings    2.065
## Proportion Var 0.516

Pregunta 2

# Read the second workbook.
data2 <- import("datafinal.xlsx")

# Build the three indicators used for clustering and keep them together
# with the province identifier:
#   PORC_AGU_DEN - AGU_DEN as a percentage of AGU_TOTAL
#   RAZON_VOT    - ratio of VOT_FP to VOT_PL
#   TASA         - FAL_CASOS per 1000 COV_CASOS
datapor2 <- data2 %>%
  mutate(
    PORC_AGU_DEN = (AGU_DEN / AGU_TOTAL) * 100,
    RAZON_VOT = (VOT_FP / VOT_PL),
    TASA = ((FAL_CASOS / COV_CASOS) * 1000)
  ) %>%
  select(PROVINCIA, PORC_AGU_DEN, RAZON_VOT, TASA)

# Drop the LIMA row. `%in%` never returns NA, so a missing PROVINCIA keeps
# its row intact; subsetting with `!=` would replace such a row with one
# made entirely of NA values.
datapor2 <- datapor2[!(datapor2$PROVINCIA %in% "LIMA"), ]

Data lista

# Box plots of the three indicators (columns 2 to 4) on their original scales.
boxplot(datapor2[, 2:4], horizontal = FALSE, las = 2, cex.axis = 0.5)

# Standardize the indicators once, store them back in place, and plot the
# stored values instead of recomputing the same normalization for the plot.
datapor2[, 2:4] <- normalize(datapor2[, 2:4], method = "standardize")
boxplot(datapor2[, 2:4])

# Pairwise correlations between the standardized indicators.
cor(datapor2[, 2:4])

##              PORC_AGU_DEN   RAZON_VOT        TASA
## PORC_AGU_DEN    1.0000000 -0.23767178  0.15643461
## RAZON_VOT      -0.2376718  1.00000000 -0.07524669
## TASA            0.1564346 -0.07524669  1.00000000

# Clustering input: the three indicator columns, with the province names as
# row names so each row can be identified in later output.
dataClus <- datapor2[, 2:4]
rownames(dataClus) <- datapor2$PROVINCIA

# Gower dissimilarities between rows, computed with cluster::daisy().
library(cluster)
g.dist <- daisy(dataClus, metric = "gower")

library(factoextra)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Gap-statistic plot to choose the number of PAM clusters (up to 10).
# The gap statistic is estimated from bootstrap reference data sets, so the
# seed is fixed beforehand to make the suggested number reproducible.
# NOTE(review): `diss` may not be used by the "gap_stat" method - confirm
# against the factoextra documentation.
set.seed(123)
fviz_nbclust(
  dataClus,
  pam,
  diss = g.dist,
  method = "gap_stat",
  k.max = 10,
  verbose = FALSE
)

# Number of clusters shared by the clustering runs that follow.
K <- 3

# Partition the Gower dissimilarities into K clusters with PAM.
set.seed(123)
res.pam <- pam(g.dist, k = K, cluster.only = FALSE)

# New column: PAM cluster assigned to each row.
dataClus$pam <- res.pam$cluster
head(dataClus)

##               PORC_AGU_DEN  RAZON_VOT       TASA pam
## ABANCAY          0.6900947 -0.3540687 -0.5660703   1
## ACOBAMBA         0.2989593  0.6763994 -0.3088953   2
## ACOMAYO          0.7647068  3.0681095  0.6704760   1
## AIJA             1.0330394 -0.4533493 -0.6927374   1
## ALTO AMAZONAS   -0.4256137 -0.4220545  0.1347549   2
## AMBO            -0.5512555 -0.2620915 -0.1916298   2

Jerárquica Aglomerativa

library(factoextra)
set.seed(123)

# Agglomerative hierarchical clustering (agnes, "ward.D" linkage) on the
# Gower dissimilarities, cut into K clusters.
res.agnes <- hcut(
  g.dist,
  k = K,
  hc_func = "agnes",
  hc_method = "ward.D"
)

# Store each row's cluster and draw the silhouette plot.
dataClus$agnes <- res.agnes$cluster

fviz_silhouette(res.agnes, print.summary = FALSE)

Técnica jerárquica divisiva

set.seed(123)

# Divisive hierarchical clustering (diana) on the Gower dissimilarities,
# cut into K clusters.
res.diana <- hcut(g.dist, k = K, hc_func = "diana")

# Store each row's cluster and draw the silhouette plot.
dataClus$diana <- res.diana$cluster

fviz_silhouette(res.diana, print.summary = FALSE)

La técnica divisiva clusteriza erróneamente menos provincias que la técnica aglomerativa.

Técnica de partición:

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

set.seed(123)

# PAM partition of the Gower dissimilarities. The number of clusters comes
# from K (as in the hierarchical runs) rather than a hard-coded literal, so
# all three techniques are compared with the same cluster count.
res.pam <- pam(g.dist, k = K, cluster.only = FALSE)

# New column: PAM cluster assigned to each row.
dataClus$pam <- res.pam$cluster

fviz_silhouette(res.pam, print.summary = FALSE)

La técnica de partición clusteriza erróneamente menos provincias, pero tiene un ancho de silueta promedio menor.

FINALTEST

2024-11-28

Análisis factorial

Pregunta 2