LIMPIEZA DE DATA 5: ABRIR DATA

library(rio)
library(tidyverse)
library(magrittr)
library(polycor)
library(ggcorrplot)
library(psych)
library(matrixcalc)

# Load the household cooking-energy workbook; the sheet has no usable header
# row, so columns come back with placeholder names ("...2", "...3", ...).
data_energia <- import(file = "DATA 5_HOGAR_COCINA.xlsx")

## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`

# Show the placeholder column names assigned on import.
colnames(data_energia)

##  [1] "Título" "...2"   "...3"   "...4"   "...5"   "...6"   "...7"   "...8"   "...9"   "...10"  "...11"  "...12" 
## [13] "...13"  "...14"  "...15"

ELIMINAR FILAS

# Discard the first four rows of the imported sheet.
data_energia <- data_energia[-seq_len(4), ]

ELIMINAR COLUMNAS Y FILAS FINALES

# Drop the first column and rows 198-200 in a single subsetting step
# (the two removals are independent of each other).
data_energia <- data_energia[-(198:200), -1]

PASAR DE CHARACTER A NUMERIC

# Convert columns `...4` through `...15` from character to numeric.
# Cells that are not parseable as numbers become NA, with a coercion warning.
data_energia <- mutate(
  data_energia,
  across(`...4`:`...15`, as.numeric)
)

## Warning: There were 12 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(c(...4:...15), as.numeric)`.
## Caused by warning:
## ! NAs introducidos por coerción
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 11 remaining warnings.

QUITAR TILDES

library(scraEP)

# Normalise the second column (province labels): strip accents, then
# upper-case. Both functions operate on whole character vectors, so no
# sapply() wrapper is needed (sapply() also attached the original strings
# as element names to the result).
data_energia[[2]] <- toupper(unaccent(data_energia[[2]]))

names(data_energia)[1:2] <- c("UBIGEO", "PROVINCIA")

# Remove the "<DEPARTAMENTO>, PROVINCIA: " prefix. The pattern is anchored at
# the start and consumes everything up to the comma, so multi-word department
# names are removed completely. The previous single-word pattern
# "[A-Z...]+, PROVINCIA:" would turn "LA LIBERTAD, PROVINCIA: TRUJILLO" into
# "LA TRUJILLO" because the character class cannot match a space.
data_energia$PROVINCIA <- sub(
  "^.*,\\s*PROVINCIA:\\s*", "",
  data_energia$PROVINCIA
)

# Manual recodes for labels that do not follow the "DEPT, PROVINCIA: X" form.
# %in% never yields NA, so the assignment is safe even if PROVINCIA has NAs
# (an `==` mask with NA is rejected in data-frame row assignment).
province_recodes <- c(
  "MADRE DE DIOS PROV. DE TAMBOPATA" = "TAMBOPATA",
  "MADRE DE DIOS PROV. DE MANU" = "MANU",
  "MADRE DE DIOS PROV. DE TAHUAMANU" = "TAHUAMANU",
  "PROVINCIA CONSTITUCIONAL DEL CALLAO" = "CALLAO"
)
needs_recode <- data_energia$PROVINCIA %in% names(province_recodes)
data_energia$PROVINCIA[needs_recode] <-
  unname(province_recodes[data_energia$PROVINCIA[needs_recode]])

CAMBIAR LOS NOMBRES DE LAS COLUMNAS

# Give columns 3-14 descriptive names: for each energy source (electricity,
# LPG gas, charcoal, firewood) a "NO USA" / "SI USA" pair followed by a total.
colnames(data_energia)[3:14] <- c(
  "NO USA ELECTRICIDAD", "SI USA ELECTRICIDAD", "TOTAL1",
  "NO USA GAS (BALON GLP)", "SI USA GAS (BALON GLP)", "TOTAL2",
  "NO USA CARBON", "SI USA CARBON", "TOTAL3",
  "NO USA LENA", "SI USA LENA", "TOTAL4"
)

ELIMINAR FILAS

# Remove the first remaining row (presumably the sheet's own header row,
# which is what produced the NA-coercion warnings -- confirm against the file).
data_energia <- data_energia[-1, ]

SOBRE LA PREGUNTA 1: some libraries needed

library(magrittr)
library(polycor)
library(psych)
library(matrixcalc)
library(GPArotation)
library(BBmisc)
library(rio)
library(tidyverse)

read data

# Confirm the cleaned column names.
colnames(data_energia)

##  [1] "UBIGEO"                 "PROVINCIA"              "NO USA ELECTRICIDAD"    "SI USA ELECTRICIDAD"   
##  [5] "TOTAL1"                 "NO USA GAS (BALON GLP)" "SI USA GAS (BALON GLP)" "TOTAL2"                
##  [9] "NO USA CARBON"          "SI USA CARBON"          "TOTAL3"                 "NO USA LENA"           
## [13] "SI USA LENA"            "TOTAL4"

# Check column types after cleaning (counts should be numeric).
str(object = data_energia)

## 'data.frame': 196 obs. of  14 variables:
##  $ UBIGEO                : chr  "101" "102" "103" "104" ...
##  $ PROVINCIA             : chr  "CHACHAPOYAS" "BAGUA" "BONGARA" "CONDORCANQUI" ...
##  $ NO USA ELECTRICIDAD   : num  14763 20313 7689 9853 13112 ...
##  $ SI USA ELECTRICIDAD   : num  574 161 124 14 90 65 255 921 16 33 ...
##  $ TOTAL1                : num  15337 20474 7813 9867 13202 ...
##  $ NO USA GAS (BALON GLP): num  4696 10557 3154 8331 6863 ...
##  $ SI USA GAS (BALON GLP): num  10641 9917 4659 1536 6339 ...
##  $ TOTAL2                : num  15337 20474 7813 9867 13202 ...
##  $ NO USA CARBON         : num  15161 20185 7755 9841 13169 ...
##  $ SI USA CARBON         : num  176 289 58 26 33 26 335 218 4 4 ...
##  $ TOTAL3                : num  15337 20474 7813 9867 13202 ...
##  $ NO USA LENA           : num  7236 7357 2345 1059 1833 ...
##  $ SI USA LENA           : num  8101 13117 5468 8808 11369 ...
##  $ TOTAL4                : num  15337 20474 7813 9867 13202 ...

data for factorial

# Identifier and total columns are excluded from the factor analysis;
# everything else goes into theData.
dontselect <- c(
  "UBIGEO", "PROVINCIA",
  "TOTAL1", "TOTAL2", "TOTAL3", "TOTAL4"
)
select <- setdiff(names(data_energia), dontselect)
theData <- data_energia[, select]

# Preview the first ten rows of the full cleaned table.
head(data_energia, n = 10)

##    UBIGEO            PROVINCIA NO USA ELECTRICIDAD SI USA ELECTRICIDAD TOTAL1 NO USA GAS (BALON GLP)
## 6     101          CHACHAPOYAS               14763                 574  15337                   4696
## 7     102                BAGUA               20313                 161  20474                  10557
## 8     103              BONGARA                7689                 124   7813                   3154
## 9     104         CONDORCANQUI                9853                  14   9867                   8331
## 10    105                 LUYA               13112                  90  13202                   6863
## 11    106 RODRIGUEZ DE MENDOZA                9103                  65   9168                   5387
## 12    107            UTCUBAMBA               31248                 255  31503                  15425
## 13    201               HUARAZ               45333                 921  46254                  15947
## 14    202                 AIJA                1989                  16   2005                   1527
## 15    203     ANTONIO RAYMONDI                3894                  33   3927                   3151
##    SI USA GAS (BALON GLP) TOTAL2 NO USA CARBON SI USA CARBON TOTAL3 NO USA LENA SI USA LENA TOTAL4
## 6                   10641  15337         15161           176  15337        7236        8101  15337
## 7                    9917  20474         20185           289  20474        7357       13117  20474
## 8                    4659   7813          7755            58   7813        2345        5468   7813
## 9                    1536   9867          9841            26   9867        1059        8808   9867
## 10                   6339  13202         13169            33  13202        1833       11369  13202
## 11                   3781   9168          9142            26   9168        1824        7344   9168
## 12                  16078  31503         31168           335  31503       10790       20713  31503
## 13                  30307  46254         46036           218  46254       24753       21501  46254
## 14                    478   2005          2001             4   2005         219        1786   2005
## 15                    776   3927          3923             4   3927         351        3576   3927

correlations

# Heterogeneous correlation matrix of the analysis variables; only the
# correlations component of the hetcor result is kept.
corMatrix <- polycor::hetcor(theData)[["correlations"]]

## Warning in hetcor.data.frame(theData): the correlation matrix has been adjusted to make it positive-definite

previous evaluations

# Kaiser-Meyer-Olkin sampling adequacy, overall and per item.
KMO(r = corMatrix)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = corMatrix)
## Overall MSA =  0.72
## MSA for each item = 
##    NO USA ELECTRICIDAD    SI USA ELECTRICIDAD NO USA GAS (BALON GLP) SI USA GAS (BALON GLP)          NO USA CARBON 
##                   0.77                   0.76                   0.74                   0.74                   0.77 
##          SI USA CARBON            NO USA LENA            SI USA LENA 
##                   0.52                   0.77                   0.36

# Bartlett test on the correlation matrix: TRUE here would mean p > 0.05.
# Observed: FALSE.
cortest.bartlett(R = corMatrix, n = nrow(theData))[["p.value"]] > 0.05

## [1] FALSE

# Is the correlation matrix singular? Observed: TRUE.
is.singular.matrix(x = corMatrix)

## [1] TRUE

# Parallel analysis to suggest the number of factors; plotting is disabled.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
fa.parallel(
  theData,
  fa = "fa",
  correct = TRUE,
  plot = FALSE
)

## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect.  Try a different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An ultra-Heywood case was detected.
## Examine the results carefully

## In factor.scores, the correlation matrix is singular, the pseudo inverse is  used

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect.  Try a different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect.  Try a different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect.  Try a different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An ultra-Heywood case was detected.
## Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect.  Try a different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An ultra-Heywood case was detected.
## Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect.  Try a different factor score estimation method.

## Parallel analysis suggests that the number of factors =  1  and the number of components =  NA

run factor analysis

# Exploratory factor analysis: one factor, minimum-residual extraction,
# correlations computed with the "mixed" option.
resfa <- fa(
  r = theData,
  nfactors = 1,
  fm = "minres",
  rotate = "varimax", # open question from the author: use oblimin instead?
  cor = "mixed"
)

## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect.  Try a different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An ultra-Heywood case was detected.
## Examine the results carefully

## In factor.scores, the correlation matrix is singular, the pseudo inverse is  used

see results

# Factor loadings of each variable on the extracted factor.
print(resfa[["loadings"]])

## 
## Loadings:
##                        MR1  
## NO USA ELECTRICIDAD    1.000
## SI USA ELECTRICIDAD    0.987
## NO USA GAS (BALON GLP) 0.983
## SI USA GAS (BALON GLP) 0.996
## NO USA CARBON          0.999
## SI USA CARBON          0.532
## NO USA LENA            0.994
## SI USA LENA            0.370
## 
##                  MR1
## SS loadings    6.337
## Proportion Var 0.792

# Path diagram of the fitted factor model.
fa.diagram(fa.results = resfa, main = "Resultados del EFA")

plot of chunk unnamed-chunk-15

testing results

# Which variables contributed most to the factors? (ascending communality)
sort(resfa[["communality"]])

##            SI USA LENA          SI USA CARBON NO USA GAS (BALON GLP)    SI USA ELECTRICIDAD            NO USA LENA 
##              0.1366153              0.2833894              0.9657794              0.9735678              0.9883590 
## SI USA GAS (BALON GLP)          NO USA CARBON    NO USA ELECTRICIDAD 
##              0.9911427              0.9971055              1.0008799

# Which variables contribute to the construction of more than one factor?
sort(resfa[["complexity"]])

##            SI USA LENA    SI USA ELECTRICIDAD          NO USA CARBON            NO USA LENA    NO USA ELECTRICIDAD 
##                      1                      1                      1                      1                      1 
## NO USA GAS (BALON GLP) SI USA GAS (BALON GLP)          SI USA CARBON 
##                      1                      1                      1

# Tucker-Lewis index: is it above 0.9?
resfa[["TLI"]]

## [1] 0.05762794

CONVERTIR A RMD

# Convert the script to R Markdown and knit it; precious = TRUE keeps the
# intermediate .Rmd file instead of deleting it.
knitr::spin(hair = "DATA 5.R", precious = TRUE)

## 
## 
## processing file: DATA 5.Rmd

## 
  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |...                                                                                      |   3%                   
  |                                                                                               
  |.....                                                                                    |   6% [unnamed-chunk-35]
  |                                                                                               
  |........                                                                                 |   9%                   
  |                                                                                               
  |..........                                                                               |  11% [unnamed-chunk-36]
  |                                                                                               
  |.............                                                                            |  14%                   
  |                                                                                               
  |...............                                                                          |  17% [unnamed-chunk-37]
  |                                                                                               
  |..................                                                                       |  20%                   
  |                                                                                               
  |....................                                                                     |  23% [unnamed-chunk-38]
  |                                                                                               
  |.......................                                                                  |  26%                   
  |                                                                                               
  |.........................                                                                |  29% [unnamed-chunk-39]
  |                                                                                               
  |............................                                                             |  31%                   
  |                                                                                               
  |...............................                                                          |  34% [unnamed-chunk-40]
  |                                                                                               
  |.................................                                                        |  37%                   
  |                                                                                               
  |....................................                                                     |  40% [unnamed-chunk-41]
  |                                                                                               
  |......................................                                                   |  43%                   
  |                                                                                               
  |.........................................                                                |  46% [unnamed-chunk-42]
  |                                                                                               
  |...........................................                                              |  49%                   
  |                                                                                               
  |..............................................                                           |  51% [unnamed-chunk-43]
  |                                                                                               
  |................................................                                         |  54%                   
  |                                                                                               
  |...................................................                                      |  57% [unnamed-chunk-44]
  |                                                                                               
  |.....................................................                                    |  60%                   
  |                                                                                               
  |........................................................                                 |  63% [unnamed-chunk-45]
  |                                                                                               
  |..........................................................                               |  66%                   
  |                                                                                               
  |.............................................................                            |  69% [unnamed-chunk-46]
  |                                                                                               
  |................................................................                         |  71%                   
  |                                                                                               
  |..................................................................                       |  74% [unnamed-chunk-47]
  |                                                                                               
  |.....................................................................                    |  77%                   
  |                                                                                               
  |.......................................................................                  |  80% [unnamed-chunk-48]
  |                                                                                               
  |..........................................................................               |  83%                   
  |                                                                                               
  |............................................................................             |  86% [unnamed-chunk-49]
  |                                                                                               
  |...............................................................................          |  89%                   
  |                                                                                               
  |.................................................................................        |  91% [unnamed-chunk-50]
  |                                                                                               
  |....................................................................................     |  94%                   
  |                                                                                               
  |......................................................................................   |  97% [unnamed-chunk-51]
  |                                                                                               
  |.........................................................................................| 100%

## output file: DATA 5.md