Examen-final.knit

##PREGUNTA 1

library(readxl)
# Load the raw report workbook. Blank header cells get auto-generated names
# (`...2`, `...3`, ...), as the "New names" message reports.
data1 <- read_excel(path = "reporte.xlsx")

## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`

# View() opens an interactive data viewer, so it is only called in an
# interactive session. Non-interactive runs (knitting, Rscript) may have no
# viewer available and can fail or stall on an unguarded call.
if (interactive()) {
  View(data1)
}

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Data cleaning: strip the spreadsheet layout that surrounds the actual table.

# Drop the first column of the sheet.
data1 <- select(data1, -1)

# Remove the 4 leading rows and the 4 trailing rows.
n_rows <- nrow(data1)
drop_rows <- c(1:4, (n_rows - 3):n_rows)
data1 <- data1[-drop_rows, ]

# Promote the first remaining row to column names, then discard that row.
colnames(data1) <- data1[1, ]
data1 <- data1[-1, ]

# Drop the first column of the resulting table.
data1 <- data1[, -1]

#Porcentajes de los conteos positivos para todos los casos

library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

# Columns left out of the analysis; every other column of data1 is kept.
# NOTE(review): the section heading mentions percentages, but no percentage is
# computed in the visible code -- the remaining columns are used as-is; confirm.
dontselect <- c(
  "No usa electricidad",
  "No usa gas (balón GLP)",
  "No usa carbón",
  "No usa leña",
  "Departamento"
)
select <- setdiff(names(data1), dontselect)
theData <- data1[, select]

library(magrittr)
# Show the first 10 rows as a paged table.
rmarkdown::paged_table(head(theData, 10))

library(polycor)

## 
## Attaching package: 'polycor'

## The following object is masked from 'package:psych':
## 
##     polyserial

# Inspect the column types before any conversion.
str(theData)

## tibble [25 × 4] (S3: tbl_df/tbl/data.frame)
##  $ Sí usa electricidad   : chr [1:25] "1283" "6837" "1360" "12878" ...
##  $ Sí usa gas (balón GLP): chr [1:25] "52951" "188443" "56444" "382813" ...
##  $ Sí usa carbón         : chr [1:25] "943" "2408" "471" "1789" ...
##  $ Sí usa leña           : chr [1:25] "74920" "149287" "89197" "58115" ...

# The columns are character, so convert every column to numeric in place
# (assigning into `theData[]` keeps the object and its column names).
theData[] <- lapply(theData, function(column) as.numeric(as.character(column)))

class(theData)

## [1] "tbl_df"     "tbl"        "data.frame"

# Switch from a tibble to a plain data.frame.
theData <- as.data.frame(theData)

# Correlation matrix from hetcor(); only its `correlations` element is kept.
corMatrix <- hetcor(theData)[["correlations"]]
print(corMatrix)

##                        Sí usa electricidad Sí usa gas (balón GLP) Sí usa carbón
## Sí usa electricidad             1.00000000              0.9904857     0.2937747
## Sí usa gas (balón GLP)          0.99048568              1.0000000     0.3352982
## Sí usa carbón                   0.29377471              0.3352982     1.0000000
## Sí usa leña                     0.07598315              0.1468580     0.2094292
##                        Sí usa leña
## Sí usa electricidad     0.07598315
## Sí usa gas (balón GLP)  0.14685797
## Sí usa carbón           0.20942915
## Sí usa leña             1.00000000

# Same matrix, rounded to two decimals for readability.
round(corMatrix, digits = 2)

##                        Sí usa electricidad Sí usa gas (balón GLP) Sí usa carbón
## Sí usa electricidad                   1.00                   0.99          0.29
## Sí usa gas (balón GLP)                0.99                   1.00          0.34
## Sí usa carbón                         0.29                   0.34          1.00
## Sí usa leña                           0.08                   0.15          0.21
##                        Sí usa leña
## Sí usa electricidad           0.08
## Sí usa gas (balón GLP)        0.15
## Sí usa carbón                 0.21
## Sí usa leña                   1.00

library(ggcorrplot)
# Plot the correlation matrix.
ggcorrplot(corr = corMatrix)

# Kaiser-Meyer-Olkin factor adequacy, computed from the correlation matrix.
psych::KMO(corMatrix)

## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA =  0.44
## MSA for each item = 
##    Sí usa electricidad Sí usa gas (balón GLP)          Sí usa carbón 
##                   0.46                   0.46                   0.63 
##            Sí usa leña 
##                   0.13

# Bartlett test on the correlation matrix, using the number of rows of theData
# as the sample size. Prints TRUE when the p-value is above 0.05.
cortest.bartlett(corMatrix, n = nrow(theData))$p.value > 0.05

## [1] FALSE

library(matrixcalc)
# Check whether the correlation matrix is singular.
is.singular.matrix(corMatrix)

## [1] FALSE

# Parallel analysis for the number of factors, without plotting.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be
# reassigned, which would silently change these arguments.
fa.parallel(theData, fa = "fa", correct = TRUE, plot = FALSE)

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Parallel analysis suggests that the number of factors =  1  and the number of components =  NA

RESULTADO: No se puede aplicar rotación varimax ni oblimin porque fa.parallel sugiere un solo factor. Además, la adecuación muestral es baja (KMO global = 0.44), aunque la prueba de Bartlett arroja un p-valor menor a 0.05.

##PREGUNTA 2

library(readxl)
# Load the workbook used for the clustering question.
data2 <- read_excel(path = "dataOK_all.xlsx")

## New names:
## • `` -> `...1`

# View() opens an interactive data viewer, so it is only called in an
# interactive session. Non-interactive runs (knitting, Rscript) may have no
# viewer available and can fail or stall on an unguarded call.
if (interactive()) {
  View(data2)
}

Utilizando el porcentaje de viviendas que tienen agua de red pública dentro de la vivienda (agua1_Red), la razón de votación de Keiko entre Castillo y la tasa de fallecidos por cada 1000 contagiados, sin Lima.

# Derived indicators used for the clustering:
#  - agua1_Red: overwritten with its share of agua10_Total (a ratio; it is not
#    multiplied by 100).
#  - CastilloKeiko_ratio: Castillo divided by Keiko.
#    NOTE(review): the accompanying text says "Keiko entre Castillo", i.e. the
#    inverse ratio -- confirm which one is intended.
#  - fallecido_x10000POS: 1000 * countFallecidos / countPositivos.
#    NOTE(review): the column name says x10000 but the factor is 1000.
data2 <- dplyr::mutate(
  data2,
  agua1_Red = agua1_Red / agua10_Total,
  CastilloKeiko_ratio = Castillo / Keiko,
  fallecido_x10000POS = 1000 * countFallecidos / countPositivos
)

# Keep the identifier plus the three indicators that feed the clustering.
keptForCluster <- c(
  "key", "agua1_Red", "CastilloKeiko_ratio", "fallecido_x10000POS"
)
data2_small <- data2[, keptForCluster]

# Leave out the rows whose key is "LIMA+LIMA".
data2_small <- data2_small[data2_small$key != "LIMA+LIMA", ]

# Standardize the indicator columns (everything but the first column) once and
# reuse the result for both the boxplot and the normalized copy of the data.
standardized <- BBmisc::normalize(data2_small[, -1], method = "standardize")
boxplot(standardized)

data2_small_norm <- data2_small
data2_small_norm[, -1] <- standardized

# Pairwise correlations of the standardized indicators, complete rows only.
cor(data2_small_norm[, -1], use = "complete.obs")

##                      agua1_Red CastilloKeiko_ratio fallecido_x10000POS
## agua1_Red            1.0000000          -0.3226139           0.1035734
## CastilloKeiko_ratio -0.3226139           1.0000000           0.1532277
## fallecido_x10000POS  0.1035734           0.1532277           1.0000000

# Build the clustering input: the standardized indicators, labelled by `key`.
# A plain data.frame is used because tibbles drop row names when subset with
# `[`, which would discard the keys in the complete.cases() filter below.
# Setting row names on a tibble is also deprecated.
# NOTE(review): data.frame row names must be unique and non-missing -- this
# assumes `key` has no duplicates or NAs; confirm.
dataClus <- as.data.frame(data2_small_norm[, -1])
row.names(dataClus) <- data2_small_norm$key

# Keep complete rows only, then compute pairwise Gower dissimilarities.
dataClus <- dataClus[complete.cases(dataClus), ]
g.dist <- cluster::daisy(dataClus, metric = "gower")

# Gap-statistic plot for choosing the number of clusters (up to 10), using
# hcut with DIANA. `verbose = FALSE` is spelled out because F is an ordinary
# variable that can be reassigned.
factoextra::fviz_nbclust(
  dataClus, factoextra::hcut,
  diss = g.dist, method = "gap_stat", k.max = 10,
  verbose = FALSE, hc_func = "diana"
)

# Same plot, using hcut with AGNES.
factoextra::fviz_nbclust(
  dataClus, factoextra::hcut,
  diss = g.dist, method = "gap_stat", k.max = 10,
  verbose = FALSE, hc_func = "agnes"
)

# AGNES clustering with k = 7 on the Gower dissimilarities; the assigned
# cluster is stored next to the indicators.
res.agnes <- factoextra::hcut(
  g.dist,
  k = 7, hc_func = "agnes", hc_method = "ward.D"
)
dataClus$agnes <- res.agnes$cluster

library(magrittr)
# Silhouette widths per observation; the row names are copied into `country`.
silAGNES <- data.frame(res.agnes$silinfo$widths)
silAGNES$country <- row.names(silAGNES)
# Poorly clustered cases are those with a negative silhouette width. The label
# column is `country` (created just above); the previous code asked for a
# non-existent `Provincia` column, which silently produced NULL.
poorAGNES <- silAGNES[silAGNES$sil_width < 0, "country"] %>% sort()
poorAGNES

# (The knitted output previously shown here, `NULL`, came from that wrong
# column name; re-knit to refresh it.)

RESULTADO: AGNES (aglomerativa) es mejor porque separa los casos en varios grupos, mientras que DIANA (divisiva) los agrupa en uno solo.

##PREGUNTA 3 hacer 3 regresiones. Como independientes usará todas las variables de COMMs y ENERGY. Cada regresión usa una variable dependiente diferente del grupo ECON, pero no usa deuda externa.

library(readxl)
# Load the data set for the regression question.
data3 <- read_excel("datafinal.xlsx")
# View() opens an interactive data viewer, so it is only called in an
# interactive session. Non-interactive runs (knitting, Rscript) may have no
# viewer available and can fail or stall on an unguarded call.
if (interactive()) {
  View(data3)
}

Inflation rate (consumer prices): % of inflation; Youth unemployment rate (ages 15-24): % of youth unemployment; Public debt: % public debt; Debt - external: % public debt - external