Examen Final

Librerías (cargué todas por si acaso desde el principio)

library(rio)
library(stargazer)
library(tidyverse)
library(stringr)
library(cluster)
library(factoextra)
library(fpc)
library(sf)
library(lmtest)
library(DescTools)
library(polycor)
library(ggcorrplot)
library(psych)
library(matrixcalc)
library(GPArotation)
library(lavaan)
library(BBmisc)
library(htmltab)

EJERCICIO 1

# Scrape the freedom-indices table from Wikipedia and prepare it for clustering.
# NOTE(review): the absolute XPath depends on the current page layout and will
# break if Wikipedia changes its markup.
lib <- htmltab(
  "https://en.wikipedia.org/wiki/List_of_freedom_indices",
  "/html/body/div[3]/div[3]/div[5]/div[1]/table[2]"
)

# Clean the country names: strip trailing horizontal/vertical whitespace, then
# drop the first two characters of every name (presumably residue of the flag
# icon in the scraped cell - confirm against the raw table).
lib <- arrange(lib, Country)
lib$Country <- trimws(lib$Country, which = "right", whitespace = "[\\h\\v]")
lib$Country <- substr(lib$Country, 3, 100)

# After sorting, the first two rows still carry extra leading characters, so
# they are trimmed again by position (4 and 2 more characters respectively).
lib[1, 1] <- substr(lib[1, 1], 5, 100)
lib[2, 1] <- substr(lib[2, 1], 3, 100)
lib <- arrange(lib, Country)

# Treat the literal "n/a" as missing and keep only complete rows.
lib[lib == "n/a"] <- NA
lib <- na.omit(lib)

# Ordered levels (worst to best) for each index column.
niveles <- list(
  `Freedom in the World 2021` = c("not free", "partly free", "free"),
  `2021 Index of Economic Freedom` = c(
    "repressed", "mostly unfree", "moderately free", "mostly free", "free"
  ),
  `2021 Press Freedom Index` = c(
    "very serious situation", "difficult situation", "noticeable problems",
    "satisfactory situation", "good situation"
  ),
  `2020 Democracy Index` = c(
    "authoritarian regime", "hybrid regime", "flawed democracy",
    "full democracy"
  )
)

# Convert every index column to an ordered factor. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
for (indice in names(niveles)) {
  lib[[indice]] <- factor(
    lib[[indice]],
    levels = niveles[[indice]],
    ordered = TRUE
  )
}

# Gower dissimilarity over every column except the country name. This must be
# computed before any cluster-label column is appended to `lib`, otherwise the
# labels would enter the distance.
g.dist <- daisy(lib[, -1], metric = "gower")

# PAM partition with 5 clusters, stored as `clustPT`.
set.seed(123)
pam.resultado <- pam(g.dist, 5, cluster.only = FALSE)
lib$clustPT <- pam.resultado$cluster

# Three-cluster solutions with PAM, AGNES and DIANA; the labels are kept in
# `lib` so the three methods can be compared.
set.seed(123)
res.pam <- pam(g.dist, k = 3, cluster.only = FALSE)
lib$pam <- res.pam$cluster
res.agnes <- hcut(g.dist, k = 3, hc_func = "agnes")
lib$agnes <- res.agnes$cluster
res.diana <- hcut(g.dist, k = 3, hc_func = "diana")
lib$diana <- res.diana$cluster

# Silhouette plot and per-cluster average widths for the PAM solution.
fviz_silhouette(res.pam)

##   cluster size ave.sil.width
## 1       1   48          0.62
## 2       2   54          0.56
## 3       3   59          0.53

# Silhouette plot and per-cluster average widths for the AGNES solution (k = 3).
fviz_silhouette(res.agnes)

##   cluster size ave.sil.width
## 1       1   47          0.64
## 2       2   57          0.53
## 3       3   57          0.58

## Warning in grid.newpage(): processing of the plot ran out of memory

# Silhouette plot and per-cluster average widths for the DIANA solution (k = 3).
fviz_silhouette(res.diana)

##   cluster size ave.sil.width
## 1       1   95          0.18
## 2       2    9          0.67
## 3       3   57          0.45

## Warning in grid.newpage(): processing of the plot ran out of memory

EJERCICIO 2

# Scrape the per-country table of the Democracy Index article and give its
# eleven columns short working names.
# NOTE(review): the absolute XPath depends on the current page layout.
fecha <- htmltab(
  "https://en.wikipedia.org/wiki/Democracy_Index",
  "/html/body/div[3]/div[3]/div[5]/div[1]/table[10]"
)
names(fecha) <- c(
  "rank", "rank2", "Country", "type", "score", "score2",
  "elect", "gov", "part", "cult", "libs"
)

# Clean the country names: strip trailing whitespace, then drop the first two
# characters (presumably flag-icon residue - confirm against the raw table).
fecha <- arrange(fecha, Country)
fecha$Country <- substr(
  trimws(fecha$Country, which = "right", whitespace = "[\\h\\v]"),
  3, 100
)

# The first two rows after sorting need extra leading characters removed.
fecha[1, 3] <- substr(fecha[1, 3], 5, 100)
fecha[2, 3] <- substr(fecha[2, 3], 3, 100)
fecha <- arrange(fecha, Country)

# Columns 5 to 11 are converted to numeric; rows that end up with NA are
# dropped.
fecha[5:11] <- lapply(fecha[5:11], as.numeric)
fecha <- na.omit(fecha)

# Keep only Country and the five component columns (elect, gov, part, cult,
# libs).
fecha <- fecha[, -c(1, 2, 4, 5, 6)]

# Scrape the country table of the World Happiness Report article.
# NOTE(review): the absolute XPath depends on the current page layout.
felicidad <- htmltab(
  "https://en.wikipedia.org/wiki/World_Happiness_Report",
  "/html/body/div[3]/div[3]/div[5]/div[1]/div[11]/table/tbody/tr[2]/td/table"
)

# Clean the country names: strip trailing whitespace, then drop the first two
# characters (presumably flag-icon residue - confirm against the raw table).
felicidad <- arrange(felicidad, `Country or region`)
felicidad$`Country or region` <- substr(
  trimws(
    felicidad$`Country or region`,
    which = "right",
    whitespace = "[\\h\\v]"
  ),
  3, 100
)

# The first two rows after sorting need extra leading characters removed; the
# country name is the second column of this table.
felicidad[1, 2] <- substr(felicidad[1, 2], 5, 100)
felicidad[2, 2] <- substr(felicidad[2, 2], 3, 100)
felicidad <- arrange(felicidad, `Country or region`)

# Every column except the country name is converted to numeric; rows that end
# up with NA are dropped.
felicidad[-2] <- lapply(felicidad[-2], as.numeric)
felicidad <- na.omit(felicidad)

# Drop columns 1 and 3 (presumably the overall rank and the aggregate score -
# confirm), keeping the country name and the individual indicators.
felicidad <- felicidad[, -c(1, 3)]

# Join the democracy components with the happiness indicators by country name.
# merge() keeps only countries present in both tables by default.
d <- merge(fecha, felicidad, by.x = "Country", by.y = "Country or region")

# Gower dissimilarity over every column except the country name. The seed is
# set once here and drives the three gap-statistic runs below in sequence.
set.seed(123)
g.dist <- daisy(d[, -1], metric = "gower")

# Gap-statistic plots to choose the number of clusters for each algorithm.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
fviz_nbclust(
  d[, -1], pam,
  diss = g.dist, method = "gap_stat", k.max = 10, verbose = FALSE
)

fviz_nbclust(
  d[, -1], hcut,
  diss = g.dist, method = "gap_stat", k.max = 10, verbose = FALSE,
  hc_func = "agnes"
)

fviz_nbclust(
  d[, -1], hcut,
  diss = g.dist, method = "gap_stat", k.max = 10, verbose = FALSE,
  hc_func = "diana"
)

# Fit the three solutions and store their labels in `d`.
# NOTE(review): pam and agnes use k = 5 while diana uses k = 3 - presumably
# read off the respective gap-statistic plots; confirm this is intentional.
set.seed(123)
res.pam <- pam(g.dist, k = 5, cluster.only = FALSE)
d$pam <- res.pam$cluster
res.agnes <- hcut(g.dist, k = 5, hc_func = "agnes")
d$agnes <- res.agnes$cluster
res.diana <- hcut(g.dist, k = 3, hc_func = "diana")
d$diana <- res.diana$cluster

# Silhouette plot and per-cluster average widths for the PAM solution.
fviz_silhouette(res.pam)

##   cluster size ave.sil.width
## 1       1   34          0.19
## 2       2   22          0.22
## 3       3   15          0.22
## 4       4   17          0.30
## 5       5   21          0.33

# Silhouette plot and per-cluster average widths for the AGNES solution (k = 5).
fviz_silhouette(res.agnes)

##   cluster size ave.sil.width
## 1       1   38          0.21
## 2       2   22          0.23
## 3       3   11          0.31
## 4       4   16          0.31
## 5       5   22          0.32

# Silhouette plot and per-cluster average widths for the DIANA solution (k = 3).
fviz_silhouette(res.diana)

##   cluster size ave.sil.width
## 1       1   33          0.35
## 2       2   51          0.17
## 3       3   25          0.34

# Correlation matrix of the eleven indicators: column 1 (country name) and
# columns 13 to 15 (pam/agnes/diana cluster labels) are excluded.
vars_cor <- d[, -c(1, 13:15)]
corMatrix <- polycor::hetcor(vars_cor)$correlations

# Kaiser-Meyer-Olkin sampling adequacy, overall and per variable.
psych::KMO(corMatrix)

## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA =  0.78
## MSA for each item = 
##                        elect                          gov 
##                         0.73                         0.87 
##                         part                         cult 
##                         0.87                         0.81 
##                         libs               GDP per capita 
##                         0.79                         0.76 
##               Social support      Healthy life expectancy 
##                         0.74                         0.88 
## Freedom to make life choices                   Generosity 
##                         0.60                         0.63 
##    Perceptions of corruption 
##                         0.59

# Bartlett's test on the correlation matrix, using the number of rows of `d`
# as the sample size. Prints TRUE when the p-value exceeds 0.05.
bartlett <- cortest.bartlett(corMatrix, n = nrow(d))
bartlett$p.value > 0.05

## [1] FALSE

# Check whether the correlation matrix is singular (TRUE means singular).
is.singular.matrix(corMatrix)

## [1] FALSE

# Parallel analysis to suggest how many factors to extract from the eleven
# indicators (country name and cluster-label columns excluded).
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
# NOTE(review): this step uses fm = "ML" while the later fa() call uses
# fm = "minres" and cor = "mixed" - confirm the mismatch is intentional. Also
# check whether `correct` should be the numeric continuity correction instead.
fa.parallel(d[, -c(1, 13:15)], fm = "ML", fa = "fa", correct = TRUE)

## Parallel analysis suggests that the number of factors =  3  and the number of components =  NA

# Exploratory factor analysis with three factors on the eleven indicators
# (country name and cluster-label columns excluded), using mixed correlations,
# minres extraction and a varimax rotation.
vars_fa <- d[, -c(1, 13:15)]
resfa <- fa(
  vars_fa,
  nfactors = 3,
  rotate = "varimax",
  fm = "minres",
  cor = "mixed"
)

# Show only loadings of at least 0.5 in absolute value.
print(resfa$loadings, cutoff = 0.5)

## 
## Loadings:
##                              MR1    MR3    MR2   
## elect                         0.943              
## gov                           0.738              
## part                          0.720              
## cult                          0.590              
## libs                          0.905              
## GDP per capita                       0.903       
## Social support                       0.740       
## Healthy life expectancy              0.824       
## Freedom to make life choices                     
## Generosity                                  0.502
## Perceptions of corruption                   0.765
## 
##                  MR1   MR3   MR2
## SS loadings    3.417 2.603 1.323
## Proportion Var 0.311 0.237 0.120
## Cumulative Var 0.311 0.547 0.667

# Draw the diagram of the fitted factor model (variables linked to factors).
fa.diagram(resfa)

EJERCICIO 3

# Download the Excel workbook from the GitHub repository (`?raw=true` requests
# the raw file rather than the HTML page) and load it into `a`.
# NOTE(review): the file name suggests a World Bank indicator export
# (SH.XPD.CHEX.GD.ZS) - confirm which sheet/indicator is being read.
a=import("https://github.com/Fabians099/ExamenFinal/blob/main/API_SH.XPD.CHEX.GD.ZS_DS2_en_excel_v2_3360262.xls?raw=true")

## New names:
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * `` -> ...7
## * ...

# From the workbook keep column 1 (country) and column 63 (the value used as
# `sal`), dropping the first three rows (presumably header/metadata rows of
# the export - confirm). Rows without a value are removed.
a <- a[-c(1:3), c(1, 63)]
names(a) <- c("Pais", "sal")
a <- na.omit(a)

# Second source: columns 1 and 3 become country and `edu`.
edu_tab <- import(
  "https://github.com/Fabians099/ExamenFinal/raw/main/export%20(1).csv"
)
edu_tab <- edu_tab[, c(1, 3)]
edu_tab <- na.omit(edu_tab)
names(edu_tab) <- c("Pais", "edu")

# Third source: columns 1 and 3 become country and `mil`. The table is no
# longer stored in a variable called `c`, which shadowed base::c().
mil_tab <- import(
  "https://github.com/Fabians099/ExamenFinal/raw/main/export%20(2).csv"
)
mil_tab <- mil_tab[, c(1, 3)]
names(mil_tab) <- c("Pais", "mil")
mil_tab <- na.omit(mil_tab)

# Inner-join the three tables on their only shared column, "Pais", and make
# every non-country column numeric.
a <- merge(a, edu_tab)
a <- merge(a, mil_tab)
a[-1] <- lapply(a[-1], as.numeric)

# Linear model of `edu` on `sal` and `mil`. summary() is called without
# `type = "text"`: that is a stargazer argument which summary.lm() ignored.
modelo <- formula(edu ~ sal + mil)
regresion <- lm(modelo, data = a)
summary(regresion)

## 
## Call:
## lm(formula = modelo, data = a)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7661 -0.9090 -0.1817  0.9314  4.5709 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.47832    0.38146   6.497 1.45e-09 ***
## sal          0.27174    0.04699   5.783 4.86e-08 ***
## mil          0.05593    0.08377   0.668    0.505    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.44 on 135 degrees of freedom
## Multiple R-squared:  0.1987, Adjusted R-squared:  0.1868 
## F-statistic: 16.73 on 2 and 135 DF,  p-value: 3.221e-07

# Linear model of `sal` on `edu` and `mil`; overwrites the previous
# `regresion` object. summary() is called without `type = "text"`: that is a
# stargazer argument which summary.lm() silently ignored.
modelo1 <- sal ~ edu + mil
regresion <- lm(modelo1, data = a)
summary(regresion)

## 
## Call:
## lm(formula = modelo1, data = a)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1289 -1.6840 -0.4144  1.4152 10.1212 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.7400     0.6403   5.841 3.68e-08 ***
## edu           0.7306     0.1263   5.783 4.86e-08 ***
## mil          -0.1686     0.1368  -1.232     0.22    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.361 on 135 degrees of freedom
## Multiple R-squared:  0.2049, Adjusted R-squared:  0.1932 
## F-statistic:  17.4 on 2 and 135 DF,  p-value: 1.891e-07

# Linear model of `mil` on `edu` and `sal`; overwrites the previous
# `regresion` object. summary() is called without `type = "text"`: that is a
# stargazer argument which summary.lm() silently ignored.
modelo2 <- mil ~ edu + sal
regresion <- lm(modelo2, data = a)
summary(regresion)

## 
## Call:
## lm(formula = modelo2, data = a)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7444 -0.8761 -0.3499  0.3717  8.8967 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.05814    0.41180   4.998 1.77e-06 ***
## edu          0.05885    0.08813   0.668    0.505    
## sal         -0.06598    0.05354  -1.232    0.220    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.477 on 135 degrees of freedom
## Multiple R-squared:  0.01126,    Adjusted R-squared:  -0.003387 
## F-statistic: 0.7688 on 2 and 135 DF,  p-value: 0.4656

Examen Final

Fabian Sanchez

12/13/2021