LIMPIEZA DE DATA 5 — ABRIR DATA
library(rio)
library(tidyverse)
library(magrittr)
library(polycor)
library(ggcorrplot)
library(psych)
library(matrixcalc)
# Load the raw spreadsheet. Its unnamed columns are auto-named on import
# ("...2" through "...15"), as the "New names" messages report.
data_energia <- import(file = "DATA 5_HOGAR_COCINA.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
# List the column names the import step produced.
colnames(data_energia)
## [1] "Título" "...2" "...3" "...4" "...5" "...6" "...7" "...8" "...9" "...10" "...11" "...12"
## [13] "...13" "...14" "...15"
ELIMINAR FILAS
# Drop the first four rows of the sheet.
data_energia <- data_energia[-seq_len(4), ]
ELIMINAR COLUMNAS
# Drop the first column, then rows 198 through 200 of what remains.
data_energia <- data_energia[, -1]
data_energia <- data_energia[-(198:200), ]
PASAR DE CHARACTER A NUMERIC
# Coerce columns `...4` through `...15` from character to numeric. Cells that
# do not parse as numbers become NA, which is what the coercion warnings
# report.
data_energia <- mutate(
  data_energia,
  across(c(...4:...15), as.numeric)
)
## Warning: There were 12 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(c(...4:...15), as.numeric)`.
## Caused by warning:
## ! NAs introducidos por coerción
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 11 remaining warnings.
QUITAR TILDES
library(scraEP)
# Name the first two columns, normalize the province labels (accents removed,
# upper case) and reduce each label to the bare province name.
names(data_energia)[1:2] <- c("UBIGEO", "PROVINCIA")

# unaccent() and toupper() accept whole character vectors, so no sapply().
data_energia$PROVINCIA <- toupper(unaccent(data_energia$PROVINCIA))

# Strip the leading "<DEPARTMENT>, PROVINCIA: " prefix. `[^,]*` (instead of a
# letters-only class) also consumes multi-word departments, so
# "LA LIBERTAD, PROVINCIA: TRUJILLO" becomes "TRUJILLO" and not "LA TRUJILLO".
data_energia$PROVINCIA <- sub(
  "^[^,]*, PROVINCIA:\\s*", "", data_energia$PROVINCIA
)

# Labels that do not follow the "<DEPARTMENT>, PROVINCIA: " pattern.
province_fixes <- c(
  "MADRE DE DIOS PROV. DE TAMBOPATA" = "TAMBOPATA",
  "MADRE DE DIOS PROV. DE MANU" = "MANU",
  "MADRE DE DIOS PROV. DE TAHUAMANU" = "TAHUAMANU",
  "PROVINCIA CONSTITUCIONAL DEL CALLAO" = "CALLAO"
)
# %in% never yields NA, so a missing label cannot break the assignment.
needs_fix <- data_energia$PROVINCIA %in% names(province_fixes)
data_energia$PROVINCIA[needs_fix] <-
  unname(province_fixes[data_energia$PROVINCIA[needs_fix]])
CAMBIAR LOS NOMBRES DE LAS COLUMNAS
# Name columns 3 to 14 in one assignment: for each energy source there is a
# "NO USA" column, a "SI USA" column and a TOTAL column.
colnames(data_energia)[3:14] <- c(
  "NO USA ELECTRICIDAD", "SI USA ELECTRICIDAD", "TOTAL1",
  "NO USA GAS (BALON GLP)", "SI USA GAS (BALON GLP)", "TOTAL2",
  "NO USA CARBON", "SI USA CARBON", "TOTAL3",
  "NO USA LENA", "SI USA LENA", "TOTAL4"
)
ELIMINAR FILAS
# Drop the first remaining row.
data_energia <- data_energia[-1, ]
SOBRE LA PREGUNTA 1 — some libraries needed
library(magrittr)
library(polycor)
library(psych)
library(matrixcalc)
library(GPArotation)
library(BBmisc)
library(rio)
library(tidyverse)
inspect data
# Confirm the final column names after cleaning.
colnames(data_energia)
## [1] "UBIGEO" "PROVINCIA" "NO USA ELECTRICIDAD" "SI USA ELECTRICIDAD"
## [5] "TOTAL1" "NO USA GAS (BALON GLP)" "SI USA GAS (BALON GLP)" "TOTAL2"
## [9] "NO USA CARBON" "SI USA CARBON" "TOTAL3" "NO USA LENA"
## [13] "SI USA LENA" "TOTAL4"
# Show each column's type and its first few values.
str(object = data_energia)
## 'data.frame': 196 obs. of 14 variables:
## $ UBIGEO : chr "101" "102" "103" "104" ...
## $ PROVINCIA : chr "CHACHAPOYAS" "BAGUA" "BONGARA" "CONDORCANQUI" ...
## $ NO USA ELECTRICIDAD : num 14763 20313 7689 9853 13112 ...
## $ SI USA ELECTRICIDAD : num 574 161 124 14 90 65 255 921 16 33 ...
## $ TOTAL1 : num 15337 20474 7813 9867 13202 ...
## $ NO USA GAS (BALON GLP): num 4696 10557 3154 8331 6863 ...
## $ SI USA GAS (BALON GLP): num 10641 9917 4659 1536 6339 ...
## $ TOTAL2 : num 15337 20474 7813 9867 13202 ...
## $ NO USA CARBON : num 15161 20185 7755 9841 13169 ...
## $ SI USA CARBON : num 176 289 58 26 33 26 335 218 4 4 ...
## $ TOTAL3 : num 15337 20474 7813 9867 13202 ...
## $ NO USA LENA : num 7236 7357 2345 1059 1833 ...
## $ SI USA LENA : num 8101 13117 5468 8808 11369 ...
## $ TOTAL4 : num 15337 20474 7813 9867 13202 ...
data for factor analysis
# Build the factor-analysis input: every column except the identifiers and
# the TOTAL* columns.
dontselect <- c(
  "UBIGEO", "PROVINCIA",
  "TOTAL1", "TOTAL2", "TOTAL3", "TOTAL4"
)
select <- setdiff(names(data_energia), dontselect)
theData <- data_energia[, select]
# Preview the first ten rows of the full table.
head(data_energia, n = 10)
## UBIGEO PROVINCIA NO USA ELECTRICIDAD SI USA ELECTRICIDAD TOTAL1 NO USA GAS (BALON GLP)
## 6 101 CHACHAPOYAS 14763 574 15337 4696
## 7 102 BAGUA 20313 161 20474 10557
## 8 103 BONGARA 7689 124 7813 3154
## 9 104 CONDORCANQUI 9853 14 9867 8331
## 10 105 LUYA 13112 90 13202 6863
## 11 106 RODRIGUEZ DE MENDOZA 9103 65 9168 5387
## 12 107 UTCUBAMBA 31248 255 31503 15425
## 13 201 HUARAZ 45333 921 46254 15947
## 14 202 AIJA 1989 16 2005 1527
## 15 203 ANTONIO RAYMONDI 3894 33 3927 3151
## SI USA GAS (BALON GLP) TOTAL2 NO USA CARBON SI USA CARBON TOTAL3 NO USA LENA SI USA LENA TOTAL4
## 6 10641 15337 15161 176 15337 7236 8101 15337
## 7 9917 20474 20185 289 20474 7357 13117 20474
## 8 4659 7813 7755 58 7813 2345 5468 7813
## 9 1536 9867 9841 26 9867 1059 8808 9867
## 10 6339 13202 13169 33 13202 1833 11369 13202
## 11 3781 9168 9142 26 9168 1824 7344 9168
## 12 16078 31503 31168 335 31503 10790 20713 31503
## 13 30307 46254 46036 218 46254 24753 21501 46254
## 14 478 2005 2001 4 2005 219 1786 2005
## 15 776 3927 3923 4 3927 351 3576 3927
correlations
# Correlation matrix of the selected columns, taken from hetcor()'s result.
corMatrix <- polycor::hetcor(theData)[["correlations"]]
## Warning in hetcor.data.frame(theData): the correlation matrix has been adjusted to make it positive-definite
previous evaluations
# Kaiser-Meyer-Olkin sampling adequacy, overall and per variable.
KMO(r = corMatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = corMatrix)
## Overall MSA = 0.72
## MSA for each item =
## NO USA ELECTRICIDAD SI USA ELECTRICIDAD NO USA GAS (BALON GLP) SI USA GAS (BALON GLP) NO USA CARBON
## 0.77 0.76 0.74 0.74 0.77
## SI USA CARBON NO USA LENA SI USA LENA
## 0.52 0.77 0.36
# Bartlett's test on the correlation matrix: TRUE would mean p > 0.05.
cortest.bartlett(corMatrix, n = nrow(theData))[["p.value"]] > 0.05 ## FALSE
## [1] FALSE
# Check whether the correlation matrix is singular.
matrixcalc::is.singular.matrix(corMatrix) ## TRUE
## [1] TRUE
# Parallel analysis to suggest how many factors to extract, without plotting.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, which would silently change these arguments.
fa.parallel(theData, fa = "fa", correct = TRUE, plot = FALSE)
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect. Try a different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An ultra-Heywood case was detected.
## Examine the results carefully
## In factor.scores, the correlation matrix is singular, the pseudo inverse is used
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect. Try a different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect. Try a different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect. Try a different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An ultra-Heywood case was detected.
## Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect. Try a different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An ultra-Heywood case was detected.
## Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect. Try a different factor score estimation method.
## Parallel analysis suggests that the number of factors = 1 and the number of components = NA
run factor analysis
# Exploratory factor analysis on theData: one factor, minimum-residual
# extraction, correlations computed with the "mixed" option.
resfa <- fa(
  theData,
  nfactors = 1,
  cor = "mixed",
  rotate = "varimax", # oblimin?
  fm = "minres"
)
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, : The estimated weights for the factor
## scores are probably incorrect. Try a different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An ultra-Heywood case was detected.
## Examine the results carefully
## In factor.scores, the correlation matrix is singular, the pseudo inverse is used
see results
# Print the factor loadings of the fitted model.
print(resfa[["loadings"]])
##
## Loadings:
## MR1
## NO USA ELECTRICIDAD 1.000
## SI USA ELECTRICIDAD 0.987
## NO USA GAS (BALON GLP) 0.983
## SI USA GAS (BALON GLP) 0.996
## NO USA CARBON 0.999
## SI USA CARBON 0.532
## NO USA LENA 0.994
## SI USA LENA 0.370
##
## MR1
## SS loadings 6.337
## Proportion Var 0.792
# Draw the path diagram of the fitted factor model.
fa.diagram(fa.results = resfa, main = "Resultados del EFA")
testing results
# Which variables contributed most to the factors? (communalities, ascending)
sort(resfa[["communality"]])
## SI USA LENA SI USA CARBON NO USA GAS (BALON GLP) SI USA ELECTRICIDAD NO USA LENA
## 0.1366153 0.2833894 0.9657794 0.9735678 0.9883590
## SI USA GAS (BALON GLP) NO USA CARBON NO USA ELECTRICIDAD
## 0.9911427 0.9971055 1.0008799
# Which variables contribute to more than one factor? (complexity, ascending)
sort(resfa[["complexity"]])
## SI USA LENA SI USA ELECTRICIDAD NO USA CARBON NO USA LENA NO USA ELECTRICIDAD
## 1 1 1 1 1
## NO USA GAS (BALON GLP) SI USA GAS (BALON GLP) SI USA CARBON
## 1 1 1
# Tucker-Lewis index of the fit; is it above 0.9?
resfa[["TLI"]]
## [1] 0.05762794
CONVERTIR A RMD
# Convert the script to R Markdown with knitr::spin(); precious = TRUE keeps
# the intermediate files.
knitr::spin(hair = "DATA 5.R", precious = TRUE)
##
##
## processing file: DATA 5.Rmd
##
|
| | 0%
|
|... | 3%
|
|..... | 6% [unnamed-chunk-35]
|
|........ | 9%
|
|.......... | 11% [unnamed-chunk-36]
|
|............. | 14%
|
|............... | 17% [unnamed-chunk-37]
|
|.................. | 20%
|
|.................... | 23% [unnamed-chunk-38]
|
|....................... | 26%
|
|......................... | 29% [unnamed-chunk-39]
|
|............................ | 31%
|
|............................... | 34% [unnamed-chunk-40]
|
|................................. | 37%
|
|.................................... | 40% [unnamed-chunk-41]
|
|...................................... | 43%
|
|......................................... | 46% [unnamed-chunk-42]
|
|........................................... | 49%
|
|.............................................. | 51% [unnamed-chunk-43]
|
|................................................ | 54%
|
|................................................... | 57% [unnamed-chunk-44]
|
|..................................................... | 60%
|
|........................................................ | 63% [unnamed-chunk-45]
|
|.......................................................... | 66%
|
|............................................................. | 69% [unnamed-chunk-46]
|
|................................................................ | 71%
|
|.................................................................. | 74% [unnamed-chunk-47]
|
|..................................................................... | 77%
|
|....................................................................... | 80% [unnamed-chunk-48]
|
|.......................................................................... | 83%
|
|............................................................................ | 86% [unnamed-chunk-49]
|
|............................................................................... | 89%
|
|................................................................................. | 91% [unnamed-chunk-50]
|
|.................................................................................... | 94%
|
|...................................................................................... | 97% [unnamed-chunk-51]
|
|.........................................................................................| 100%
## output file: DATA 5.md