Carga de Datos

enlace="https://docs.google.com/spreadsheets/d/e/2PACX-1vQV1DBPnM_ecxxT-71cDu_mtLEveppOZixofs_LSO8bhjr5DUvaOCLGnZ9s0RIUaBuvgqyaCR8Yd9Nu/pub?gid=301662134&single=true&output=csv"

paviData=read.csv(enlace)

Verificando tipos de datos

str(paviData)
## 'data.frame':    1096 obs. of  8 variables:
##  $ poblacioncienmil: num  0.09 0.05 0.12 0.27 0.09 0.02 0.13 0.29 0.12 0.04 ...
##  $ nbi             : num  66.7 86.2 69.7 18.4 52.3 ...
##  $ consejocomunal  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ priorizado      : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ uribista        : int  0 0 1 0 0 0 1 0 0 0 ...
##  $ ejecucion       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pctopo          : num  99.4 96 94.4 94.1 94 ...
##  $ apropiaciondolar: num  0 0 9.22 0.39 0 0 0 0 6.69 0 ...

Formato de tipo de data

# categoricas
paviData[,c(3:6)]=lapply(paviData[,c(3:6)],
                           factor,levels=c(0,1),
                           labels=c('_NO','_SI'))

##check
str(paviData)
## 'data.frame':    1096 obs. of  8 variables:
##  $ poblacioncienmil: num  0.09 0.05 0.12 0.27 0.09 0.02 0.13 0.29 0.12 0.04 ...
##  $ nbi             : num  66.7 86.2 69.7 18.4 52.3 ...
##  $ consejocomunal  : Factor w/ 2 levels "_NO","_SI": 1 1 1 1 1 1 1 1 1 1 ...
##  $ priorizado      : Factor w/ 2 levels "_NO","_SI": 1 1 1 2 1 1 1 1 1 1 ...
##  $ uribista        : Factor w/ 2 levels "_NO","_SI": 1 1 2 1 1 1 2 1 1 1 ...
##  $ ejecucion       : Factor w/ 2 levels "_NO","_SI": 1 1 1 1 1 1 1 1 1 1 ...
##  $ pctopo          : num  99.4 96 94.4 94.1 94 ...
##  $ apropiaciondolar: num  0 0 9.22 0.39 0 0 0 0 6.69 0 ...

Exploracion

skimr::skim(paviData)
Data summary
Name paviData
Number of rows 1096
Number of columns 8
_______________________
Column type frequency:
factor 4
numeric 4
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
consejocomunal 0 1 FALSE 2 _NO: 1036, _SI: 60
priorizado 0 1 FALSE 2 _NO: 820, _SI: 276
uribista 0 1 FALSE 2 _SI: 560, _NO: 536
ejecucion 0 1 FALSE 2 _NO: 1055, _SI: 41

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
poblacioncienmil 0 1 0.40 2.39 0 0.07 0.14 0.26 69.27 ▇▁▁▁▁
nbi 0 1 41.79 19.73 0 27.27 40.42 55.11 98.81 ▃▇▇▃▁
pctopo 0 1 27.70 25.60 0 5.80 19.94 45.70 99.42 ▇▃▂▂▁
apropiaciondolar 0 1 8.28 16.04 0 0.00 0.00 9.39 132.64 ▇▁▁▁▁

Calculos Bivariados

Caso Numerico - Categorico

bi1=formula(apropiaciondolar~priorizado)
boxplot(bi1,data=paviData,horizontal = T)

Las varianzas son similares?

car::leveneTest(bi1,data=paviData)

Hay normalidad en cada grupo

library(dplyr)
library(rstatix) 



paviData %>%
  group_by(priorizado) %>%
  shapiro_test(apropiaciondolar)

Si hubiera normalidad y varianzas similares:

summary(aov(bi1,data=paviData))
##               Df Sum Sq Mean Sq F value Pr(>F)
## priorizado     1      0    0.01       0  0.996
## Residuals   1094 281870  257.65

De lo contrario:

kruskal.test(bi1,data=paviData)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  apropiaciondolar by priorizado
## Kruskal-Wallis chi-squared = 8.5548, df = 1, p-value = 0.003446

Caso Numerico-Numerico

bi2_1=formula(apropiaciondolar~pctopo)

car::scatterplot(bi2_1, data = paviData, regLine=list(col='red'),
                 smooth = FALSE, grid = FALSE, frame = FALSE)

Si se nota relacion lineal:

bi2_1Corr=formula(~apropiaciondolar+pctopo)
cor.test(bi2_1Corr,data=paviData)[c('estimate','p.value')]
## $estimate
##         cor 
## -0.05449567 
## 
## $p.value
## [1] 0.071324

Si no se nota relacion lineal:

cor.test(bi2_1Corr,data=paviData,method='spearman',exact=F)[c('estimate','p.value')]
## $estimate
##         rho 
## -0.03019181 
## 
## $p.value
## [1] 0.3179817

Caso categorico -categorico

# table(paviData$uribista, paviData$priorizado)
qacBase::crosstab(paviData, uribista, priorizado, type = "colpercent", chisquare=TRUE,plot=T)

Calculo de regresion

h1=formula(apropiaciondolar~priorizado+ poblacioncienmil+nbi)
h2=formula(apropiaciondolar~priorizado+ pctopo+uribista+poblacioncienmil+nbi)
h3=formula(apropiaciondolar~priorizado+ pctopo+uribista+consejocomunal+ejecucion +poblacioncienmil+nbi)

r1=lm(h1,data=paviData)
r2=lm(h2,data=paviData)
r3=lm(h3,data=paviData)

Mostrar resultados:

summary(r1)
## 
## Call:
## lm(formula = h1, data = paviData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.356  -8.150  -6.129   1.854  92.349 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      11.48177    1.11883  10.262  < 2e-16 ***
## priorizado_SI    -0.96887    1.05135  -0.922    0.357    
## poblacioncienmil  2.12179    0.19204  11.049  < 2e-16 ***
## nbi              -0.09144    0.02325  -3.933 8.91e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.07 on 1092 degrees of freedom
## Multiple R-squared:  0.1207, Adjusted R-squared:  0.1183 
## F-statistic: 49.97 on 3 and 1092 DF,  p-value: < 2.2e-16
summary(r2)
## 
## Call:
## lm(formula = h2, data = paviData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.094  -8.231  -6.096   1.739  92.067 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      12.56617    1.30820   9.606  < 2e-16 ***
## priorizado_SI    -0.95045    1.05152  -0.904 0.366256    
## pctopo           -0.02769    0.01810  -1.530 0.126206    
## uribista_SI      -0.91267    0.92032  -0.992 0.321570    
## poblacioncienmil  2.12029    0.19208  11.039  < 2e-16 ***
## nbi              -0.08797    0.02347  -3.748 0.000188 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.06 on 1090 degrees of freedom
## Multiple R-squared:  0.1231, Adjusted R-squared:  0.1191 
## F-statistic: 30.61 on 5 and 1090 DF,  p-value: < 2.2e-16
summary(r3)
## 
## Call:
## lm(formula = h3, data = paviData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.326  -7.748  -5.745   1.805  92.492 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       11.42090    1.30075   8.780  < 2e-16 ***
## priorizado_SI     -1.65304    1.05046  -1.574  0.11586    
## pctopo            -0.02633    0.01782  -1.477  0.13988    
## uribista_SI       -0.84467    0.90559  -0.933  0.35117    
## consejocomunal_SI 12.24823    2.02277   6.055 1.93e-09 ***
## ejecucion_SI       2.24939    2.39429   0.939  0.34769    
## poblacioncienmil   1.92848    0.19163  10.064  < 2e-16 ***
## nbi               -0.07427    0.02329  -3.189  0.00147 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.82 on 1088 degrees of freedom
## Multiple R-squared:  0.1527, Adjusted R-squared:  0.1472 
## F-statistic:    28 on 7 and 1088 DF,  p-value: < 2.2e-16

comparando

anova(r1,r2,r3)
library(ggplot2)
library(jtools)

plot_summs(r1,r2,r3,
           model.names = c("H1", "H2", "H3")
           )

Regresion con indicadores

dataPreparada="https://github.com/Estadistica-AnalisisPolitico/Sesion6/raw/main/idhdemo.csv"
idhdemo=read.csv(dataPreparada)
head(idhdemo)
names(idhdemo)
##  [1] "country"          "hdiRanking"       "hdi"              "hdiLife"         
##  [5] "hdiSchoolExpec"   "hdiMeanEduc"      "hdiGni"           "ideRanking"      
##  [9] "ideRegime"        "ide"              "ideElectoral"     "ideFunctioning"  
## [13] "ideParticipation" "ideCulture"       "ideLiberties"
dontselect=c("country","hdiRanking","hdi",
             "ideRanking","ideRegime","ide")
select=setdiff(names(idhdemo),dontselect) 
theData=idhdemo[,select]

corMatrix=polycor::hetcor(theData)$correlations
psych::KMO(corMatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA =  0.9
## MSA for each item = 
##          hdiLife   hdiSchoolExpec      hdiMeanEduc           hdiGni 
##             0.90             0.92             0.91             0.88 
##     ideElectoral   ideFunctioning ideParticipation       ideCulture 
##             0.84             0.93             0.96             0.87 
##     ideLiberties 
##             0.88
library(GPArotation)
resfa <- psych::fa(theData,
            nfactors = 2,
            cor = 'mixed',
            rotate = "varimax", #oblimin?
            fm="minres")
print(resfa$loadings,cutoff = 0.49)
## 
## Loadings:
##                  MR1   MR2  
## hdiLife                0.836
## hdiSchoolExpec         0.811
## hdiMeanEduc            0.782
## hdiGni                 0.813
## ideElectoral     0.906      
## ideFunctioning   0.801      
## ideParticipation 0.768      
## ideCulture       0.498      
## ideLiberties     0.901      
## 
##                  MR1   MR2
## SS loadings    3.522 3.280
## Proportion Var 0.391 0.364
## Cumulative Var 0.391 0.756
psych::fa.diagram(resfa,main = "Resultados del EFA")

De la gráfica:

idhdemo$ide_efa=resfa$scores[,1]
idhdemo$idh_efa=resfa$scores[,2]

head(idhdemo)

Verificando:

ideS=formula(~ide+ide_efa)
cor.test(ideS,data=idhdemo)[c('estimate','p.value')]
## $estimate
##       cor 
## 0.9289784 
## 
## $p.value
## [1] 2.946024e-72
idhS=formula(~hdi+idh_efa)
cor.test(idhS,data=idhdemo)[c('estimate','p.value')]
## $estimate
##       cor 
## 0.9276078 
## 
## $p.value
## [1] 1.322231e-71

Pero hay diferencias:

idh_ide_efa=formula(ide_efa~idh_efa)

car::scatterplot(idh_ide_efa, data = idhdemo, regLine=list(col='red'),
                 smooth = FALSE, grid = FALSE, frame = FALSE)

idh_ideEFACorr=formula(~ide_efa+idh_efa)
cor.test(idh_ideEFACorr,data=idhdemo)[c('estimate','p.value')]
## $estimate
##       cor 
## 0.0632508 
## 
## $p.value
## [1] 0.419607
idh_ide=formula(ide~hdi)

car::scatterplot(idh_ide, data = idhdemo, regLine=list(col='red'),
                 smooth = FALSE, grid = FALSE, frame = FALSE)

idh_ideCorr=formula(~ide+hdi)
cor.test(idh_ideCorr,data=idhdemo)[c('estimate','p.value')]
## $estimate
##      cor 
## 0.649831 
## 
## $p.value
## [1] 3.620739e-21
r1=lm(idh_ide_efa,data=idhdemo)
summary(r1)
## 
## Call:
## lm(formula = idh_ide_efa, data = idhdemo)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9808 -0.8325  0.3173  0.8105  1.4671 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.819e-17  7.533e-02   0.000     1.00
## idh_efa     6.478e-02  8.006e-02   0.809     0.42
## 
## Residual standard error: 0.9676 on 163 degrees of freedom
## Multiple R-squared:  0.004001,   Adjusted R-squared:  -0.00211 
## F-statistic: 0.6547 on 1 and 163 DF,  p-value: 0.4196
idh_ide=formula(ide~hdi)
r1_noefa=lm(idh_ide,data=idhdemo)
summary(r1_noefa)
## 
## Call:
## lm(formula = idh_ide, data = idhdemo)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6195 -1.0091  0.4006  1.2527  3.3699 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.8224     0.6613  -2.756  0.00652 ** 
## hdi           9.7393     0.8923  10.915  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.817 on 163 degrees of freedom
## Multiple R-squared:  0.4223, Adjusted R-squared:  0.4187 
## F-statistic: 119.1 on 1 and 163 DF,  p-value: < 2.2e-16