Voy a hacer el tratamiento de la base de datos como yo lo haria para un trabajo final

Usare la pregunta P23 ?? Usted voto en las pasadas elecciones del 1?? de julio de 2012 para diputados federales?

Primero hay que recodificar algunas variables:

setwd("/Users/mau/Dropbox/Cursos mau/Maestria CEI")
data<-read.csv("BaseIP3.csv",header=TRUE)
library(psych)
library(sjPlot)
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.2.10
## Current Matrix version is 1.2.7.1
## Please re-install 'TMB' from source or restore original 'Matrix' package
## #refugeeswelcome
data$voto<-ifelse(data$p23=="Si voto",1,0)
data$voto<-ifelse(data$p23=="NS/NC",NA,data$voto)
table(data$voto)
## 
##    0    1 
## 2401 8530
table(data$p21_2)
## 
##      A veces Muchas veces        NS/NC        Nunca  Pocas veces 
##         1275         1764           30          180          539 
##      Siempre 
##         7212
data$libpol<-ifelse(data$p21_2=="Siempre" | data$p21_2=="Muchas veces",1,0)
data$libpol<-ifelse(data$p21_2=="NS/NC",NA,data$libpol)
table(data$libpol)
## 
##    0    1 
## 1994 8976
table(data$p27_3)
## 
## No ha participado             Ns/Nc Si ha participado 
##              9542               117              1341
data$parpol<-ifelse(data$p27_3=="Si ha participado",1,0)
data$parpol<-ifelse(data$p27_3=="Ns/Nc",NA,data$parpol)
table(data$parpol)
## 
##    0    1 
## 9542 1341
table(data$p27_7)
## 
## No ha participado             Ns/Nc Si ha participado 
##             10385               119               496
data$protesta<-ifelse(data$p27_7=="Si ha participado",1,0)
data$protesta<-ifelse(data$p27_7=="Ns/Nc",NA,data$protesta)
table(data$protesta)
## 
##     0     1 
## 10385   496
table(data$p26)
## 
##          En la democracia las reglas son iguales para todos 
##                                                        3026 
##            En la democracia muchos participan y pocos ganan 
##                                                        5362 
## En la democracia todos colaboran para lograr un mismo objet 
##                                                        2112 
##                                                          NC 
##                                                         185 
##                                                          NS 
##                                                         315
data$percdem<-data$p26
data$percdem<-ifelse(data$p26=="NC",NA,data$percdem)
data$percdem<-ifelse(data$p26=="NS",NA,data$percdem)
table(data$percdem)
## 
##    1    2    3 
## 3026 5362 2112
data$percdem<-factor(data$percdem,levels=c(1,2,3),labels=c("Mismas_reglas","Pocos_ganan","Todos_colab"))
table(data$percdem)
## 
## Mismas_reglas   Pocos_ganan   Todos_colab 
##          3026          5362          2112
data$mujer<-ifelse(data$s1=="Mujer",1,0)
table(data$mujer)
## 
##    0    1 
## 4858 6142
table(data$s2)
## 
##    18    19    20    21    22    23    24    25    26    27    28    29 
##   277   255   253   226   241   240   252   226   185   213   241   237 
##    30    31    32    33    34    35    36    37    38    39    40    41 
##   285   154   228   212   191   204   244   216   248   218   335   135 
##    42    43    44    45    46    47    48    49    50    51    52    53 
##   299   207   178   280   165   185   274   200   274   147   228   192 
##    54    55    56    57    58    59    60    61    62    63    64    65 
##   184   151   200   167   177   139   174    95   126   119    85   135 
##    66    67    68    69    70    71    72    73    74    75    76    77 
##    85    87   110    53   117    45    69    53    61    56    64    36 
##    78    79    80    81    82    83    84    85    86    87    88    89 
##    39    36    47    15    14    27    23    15     9    12     5     7 
##    90    91    92    93    94    96    97    98 NS/NC 
##     5     3     2     1     2     2     1     1     1
data$edad<-as.numeric(as.character(data$s2))
## Warning: NAs introducidos por coerci'on
table(data$edad)
## 
##  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35 
## 277 255 253 226 241 240 252 226 185 213 241 237 285 154 228 212 191 204 
##  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53 
## 244 216 248 218 335 135 299 207 178 280 165 185 274 200 274 147 228 192 
##  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71 
## 184 151 200 167 177 139 174  95 126 119  85 135  85  87 110  53 117  45 
##  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89 
##  69  53  61  56  64  36  39  36  47  15  14  27  23  15   9  12   5   7 
##  90  91  92  93  94  96  97  98 
##   5   3   2   1   2   2   1   1
table(data$s3)
## 
##       Carrera Tecnica                    NC                    NS 
##                   713                    21                     4 
##               Ninguno          Preparatoria     Primaria completa 
##                   570                  1805                  1788 
##   Primaria incompleta   Secundaria completa Secundaria incompleta 
##                  1363                  2640                   787 
##     Universidad y mas 
##                  1309
data$escol<-0
data$escol<-ifelse(data$s3=="Ninguno" | data$s3=="Primaria incompleta",1,data$escol)
data$escol<-ifelse(data$s3=="Primaria completa" | data$s3=="Secundaria incompleta",2,data$escol)
data$escol<-ifelse(data$s3=="Secundaria completa",3,data$escol)
data$escol<-ifelse(data$s3=="Preparatoria" | data$s3=="Carrera Tecnica",4,data$escol)
data$escol<-ifelse(data$s3=="Universidad y mas",5,data$escol)
data$escol<-ifelse(data$s3=="NC" | data$s3=="NS",NA,data$escol)
table(data$escol)
## 
##    1    2    3    4    5 
## 1933 2575 2640 2518 1309
data$escol<-factor(data$escol,levels=c(1,2,3,4,5),
                      labels=c("Prim_incom","Primaria","Secundaria","Bachillerato","Superior"))
table(data$escol)
## 
##   Prim_incom     Primaria   Secundaria Bachillerato     Superior 
##         1933         2575         2640         2518         1309
table(data$s11)
## 
## NS/NC    No    Si 
##    54  7156  3790
data$benpp<-ifelse(data$s11=="Si",1,0)
data$benpp<-ifelse(data$s11=="NS/NC",NA,data$benpp)
table(data$benpp)
## 
##    0    1 
## 7156 3790

Supongamos que estamos 100% seguros de que esas seran nuestas variables finales, podemos pasarlas a una nueva base, asi evitamos estar “cargados” de otras variables que pueden estorbar un poco

names<-c("voto","libpol","parpol","protesta","percdem","mujer","edad","escol","benpp")
data2<-data[names]

Veamos como estan nuestras nuevas variables

# truco para saber cuantos NA's tenemos en nuestras variables...

for (Var in names(data2)) {
    missing <- sum(is.na(data2[, Var]))
    if (missing > 0) {
        print(c(Var, missing))
    }
}
## [1] "voto" "69"  
## [1] "libpol" "30"    
## [1] "parpol" "117"   
## [1] "protesta" "119"     
## [1] "percdem" "500"    
## [1] "edad" "1"   
## [1] "escol" "25"   
## [1] "benpp" "54"

Para quedarme solo con la base con casos validos, uso el comando complete.cases

dataf<-data2[complete.cases(data2),]

# Checando de nuevo los missing 
for (Var in names(dataf)) {
    missing <- sum(is.na(dataf[, Var]))
    if (missing > 0) {
        print(c(Var, missing))
    }
}

Primero haremos algunos descriptivos,

prop.table(table(dataf$libpol,dataf$voto),1)
##    
##             0         1
##   0 0.2624799 0.7375201
##   1 0.2072191 0.7927809
prop.table(table(dataf$parpol,dataf$voto),1)
##    
##             0         1
##   0 0.2324748 0.7675252
##   1 0.1108491 0.8891509
prop.table(table(dataf$protesta,dataf$voto),1)
##    
##             0         1
##   0 0.2201109 0.7798891
##   1 0.1587983 0.8412017
prop.table(table(dataf$percdem,dataf$voto),1)
##                
##                         0         1
##   Mismas_reglas 0.2189531 0.7810469
##   Pocos_ganan   0.2234935 0.7765065
##   Todos_colab   0.1990172 0.8009828
prop.table(table(dataf$mujer,dataf$voto),1)
##    
##             0         1
##   0 0.2205425 0.7794575
##   1 0.2147616 0.7852384
describeBy(dataf$edad,group=dataf$voto)
## 
##  Descriptive statistics by group 
## group: 0
##    vars    n  mean    sd median trimmed   mad min max range skew kurtosis
## X1    1 2217 38.12 16.67     35   36.48 19.27  18  93    75 0.68    -0.37
##      se
## X1 0.35
## -------------------------------------------------------- 
## group: 1
##    vars    n  mean    sd median trimmed   mad min max range skew kurtosis
## X1    1 7985 43.78 15.94     42    42.9 17.79  18  98    80 0.42    -0.49
##      se
## X1 0.18
prop.table(table(dataf$escol,dataf$voto),1)
##               
##                        0         1
##   Prim_incom   0.1983759 0.8016241
##   Primaria     0.2144649 0.7855351
##   Secundaria   0.2242424 0.7757576
##   Bachillerato 0.2485331 0.7514669
##   Superior     0.1746939 0.8253061
prop.table(table(dataf$benpp,dataf$voto),1)
##    
##             0         1
##   0 0.2306769 0.7693231
##   1 0.1921447 0.8078553
prop.test(table(dataf$libpol,dataf$voto))
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  table(dataf$libpol, dataf$voto)
## X-squared = 27.017, df = 1, p-value = 2.017e-07
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.03314155 0.07738001
## sample estimates:
##    prop 1    prop 2 
## 0.2624799 0.2072191
prop.test(table(dataf$parpol,dataf$voto))
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  table(dataf$parpol, dataf$voto)
## X-squared = 96.122, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.1018269 0.1414246
## sample estimates:
##    prop 1    prop 2 
## 0.2324748 0.1108491
prop.test(table(dataf$protesta,dataf$voto))
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  table(dataf$protesta, dataf$voto)
## X-squared = 9.4719, df = 1, p-value = 0.002086
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.02599906 0.09662623
## sample estimates:
##    prop 1    prop 2 
## 0.2201109 0.1587983
prop.test(table(dataf$percdem,dataf$voto))
## 
##  3-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(dataf$percdem, dataf$voto)
## X-squared = 5.2289, df = 2, p-value = 0.07321
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3 
## 0.2189531 0.2234935 0.1990172
prop.test(table(dataf$mujer,dataf$voto))
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  table(dataf$mujer, dataf$voto)
## X-squared = 0.46072, df = 1, p-value = 0.4973
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.01055437  0.02211616
## sample estimates:
##    prop 1    prop 2 
## 0.2205425 0.2147616
t.test(dataf$edad~dataf$voto)
## 
##  Welch Two Sample t-test
## 
## data:  dataf$edad by dataf$voto
## t = -14.278, df = 3422.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.437189 -4.882776
## sample estimates:
## mean in group 0 mean in group 1 
##        38.11547        43.77545
prop.test(table(dataf$escol,dataf$voto))
## 
##  5-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(dataf$escol, dataf$voto)
## X-squared = 31.203, df = 4, p-value = 2.783e-06
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3    prop 4    prop 5 
## 0.1983759 0.2144649 0.2242424 0.2485331 0.1746939
prop.test(table(dataf$benpp,dataf$voto))
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  table(dataf$benpp, dataf$voto)
## X-squared = 19.95, df = 1, p-value = 7.948e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.02185968 0.05520472
## sample estimates:
##    prop 1    prop 2 
## 0.2306769 0.1921447
cuadro

cuadro

Ahora si, podemos hacer un modelo logistico Siempre me gusta empezar con un modelo nulo…

model0<-glm(voto~1,data=dataf,family=binomial)
summary(model0)
## 
## Call:
## glm(formula = voto ~ 1, family = binomial, data = dataf)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.747   0.700   0.700   0.700   0.700  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.28141    0.02401   53.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10681  on 10201  degrees of freedom
## Residual deviance: 10681  on 10201  degrees of freedom
## AIC: 10683
## 
## Number of Fisher Scoring iterations: 4

Ahora vamos a meter nuestras variables independientes

model1<-glm(voto~libpol+parpol+protesta,data=dataf,family=binomial)
summary(model1)
## 
## Call:
## glm(formula = voto ~ libpol + parpol + protesta, family = binomial, 
##     data = dataf)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1951   0.4791   0.7104   0.7104   0.8105  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.94472    0.05360  17.624  < 2e-16 ***
## libpol       0.30348    0.05954   5.097 3.45e-07 ***
## parpol       0.85865    0.09388   9.146  < 2e-16 ***
## protesta     0.20822    0.13210   1.576    0.115    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10681  on 10201  degrees of freedom
## Residual deviance: 10543  on 10198  degrees of freedom
## AIC: 10551
## 
## Number of Fisher Scoring iterations: 4

?? Que nos dice este modelo? ?? Como se interprtan esos valores?

A mi me gusta pensar en razones de momios… odds ratio … Entonces

exp(coef(model1))
## (Intercept)      libpol      parpol    protesta 
##    2.572098    1.354569    2.359974    1.231481
exp(cbind(OR=coef(model1),confint(model1)))
## Waiting for profiling to be done...
##                   OR     2.5 %   97.5 %
## (Intercept) 2.572098 2.3173679 2.859389
## libpol      1.354569 1.2046166 1.521359
## parpol      2.359974 1.9701942 2.847389
## protesta    1.231481 0.9562589 1.606187

Vamos a meter otra variable En este caso la variable percepcion democracia es nominal con mas de dos categorias, como ya la definimos como “factor” en R, no seria necesario usar la instruccion en el veamos…

model2<-glm(voto~libpol+parpol+protesta+factor(percdem),data=dataf,family=binomial)
summary(model2)
## 
## Call:
## glm(formula = voto ~ libpol + parpol + protesta + factor(percdem), 
##     family = binomial, data = dataf)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2383   0.4811   0.7132   0.7216   0.8217  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 0.93873    0.06533  14.368  < 2e-16 ***
## libpol                      0.30050    0.05958   5.044 4.56e-07 ***
## parpol                      0.85887    0.09390   9.146  < 2e-16 ***
## protesta                    0.20736    0.13216   1.569    0.117    
## factor(percdem)Pocos_ganan -0.02642    0.05603  -0.471    0.637    
## factor(percdem)Todos_colab  0.11444    0.07174   1.595    0.111    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10681  on 10201  degrees of freedom
## Residual deviance: 10539  on 10196  degrees of freedom
## AIC: 10551
## 
## Number of Fisher Scoring iterations: 4
model2<-glm(voto~libpol+parpol+protesta+percdem,data=dataf,family=binomial)
summary(model2)
## 
## Call:
## glm(formula = voto ~ libpol + parpol + protesta + percdem, family = binomial, 
##     data = dataf)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2383   0.4811   0.7132   0.7216   0.8217  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.93873    0.06533  14.368  < 2e-16 ***
## libpol              0.30050    0.05958   5.044 4.56e-07 ***
## parpol              0.85887    0.09390   9.146  < 2e-16 ***
## protesta            0.20736    0.13216   1.569    0.117    
## percdemPocos_ganan -0.02642    0.05603  -0.471    0.637    
## percdemTodos_colab  0.11444    0.07174   1.595    0.111    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10681  on 10201  degrees of freedom
## Residual deviance: 10539  on 10196  degrees of freedom
## AIC: 10551
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(OR=coef(model2),confint(model2)))
## Waiting for profiling to be done...
##                           OR     2.5 %   97.5 %
## (Intercept)        2.5567441 2.2511936 2.908418
## libpol             1.3505278 1.2009436 1.516918
## parpol             2.3604818 1.9705178 2.848141
## protesta           1.2304217 0.9553360 1.604962
## percdemPocos_ganan 0.9739288 0.8723429 1.086643
## percdemTodos_colab 1.1212413 0.9745584 1.291109

Una manera “bonita” de presentar nuestros modelos es mediante una grafica

sjp.glm(model2,title="Razones de momios del modelo log??stico para Vot?? Vs No Vot??",
           digits=2,sortOdds=F)
## Waiting for profiling to be done...

Ahora vamos a meter mas variables

model3<-glm(voto~libpol+parpol+protesta+percdem+mujer+edad+escol+benpp,
            data=dataf,family=binomial(link = "logit"))
summary(model3)
## 
## Call:
## glm(formula = voto ~ libpol + parpol + protesta + percdem + mujer + 
##     edad + escol + benpp, family = binomial(link = "logit"), 
##     data = dataf)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5219   0.4075   0.6204   0.7510   1.1252  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -0.450951   0.133504  -3.378 0.000731 ***
## libpol              0.259058   0.060581   4.276 1.90e-05 ***
## parpol              0.823433   0.094740   8.691  < 2e-16 ***
## protesta            0.197204   0.133870   1.473 0.140723    
## percdemPocos_ganan -0.062291   0.056954  -1.094 0.274082    
## percdemTodos_colab  0.099865   0.072734   1.373 0.169745    
## mujer               0.105123   0.049637   2.118 0.034189 *  
## edad                0.026102   0.001818  14.355  < 2e-16 ***
## escolPrimaria       0.181087   0.081959   2.209 0.027142 *  
## escolSecundaria     0.319557   0.085031   3.758 0.000171 ***
## escolBachillerato   0.280247   0.087721   3.195 0.001399 ** 
## escolSuperior       0.657244   0.105398   6.236 4.49e-10 ***
## benpp               0.182107   0.053661   3.394 0.000690 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10681  on 10201  degrees of freedom
## Residual deviance: 10274  on 10189  degrees of freedom
## AIC: 10300
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(OR=coef(model3),confint(model3)))
## Waiting for profiling to be done...
##                           OR     2.5 %    97.5 %
## (Intercept)        0.6370220 0.4903631 0.8276156
## libpol             1.2957083 1.1499343 1.4582251
## parpol             2.2783069 1.8986516 2.7533222
## protesta           1.2179921 0.9423664 1.5938390
## percdemPocos_ganan 0.9396094 0.8400778 1.0502462
## percdemTodos_colab 1.1050219 0.9585731 1.2748951
## mujer              1.1108470 1.0077960 1.2242867
## edad               1.0264457 1.0228112 1.0301281
## escolPrimaria      1.1985192 1.0203062 1.4069988
## escolSecundaria    1.3765185 1.1648418 1.6257620
## escolBachillerato  1.3234568 1.1140952 1.5714103
## escolSuperior      1.9294666 1.5707515 2.3746329
## benpp              1.1997426 1.0803408 1.3332918
sjp.glm(model3,title="Razones de momios del modelo log??stico para Vot?? Vs No Vot??",
        digits=2,sort.est=F)
## Waiting for profiling to be done...