I am going to work through the database the way I would for a final course paper.
I will use question P23: Did you vote in the past federal deputy elections of July 1, 2012?
First, some variables need to be recoded:
setwd("/Users/mau/Dropbox/Cursos mau/Maestria CEI")
data<-read.csv("BaseIP3.csv",header=TRUE)
library(psych)
library(sjPlot)
data$voto<-ifelse(data$p23=="Si voto",1,0)
data$voto<-ifelse(data$p23=="NS/NC",NA,data$voto)
table(data$voto)
##
## 0 1
## 2401 8530
table(data$p21_2)
##
## A veces Muchas veces NS/NC Nunca Pocas veces
## 1275 1764 30 180 539
## Siempre
## 7212
data$libpol<-ifelse(data$p21_2=="Siempre" | data$p21_2=="Muchas veces",1,0)
data$libpol<-ifelse(data$p21_2=="NS/NC",NA,data$libpol)
table(data$libpol)
##
## 0 1
## 1994 8976
table(data$p27_3)
##
## No ha participado Ns/Nc Si ha participado
## 9542 117 1341
data$parpol<-ifelse(data$p27_3=="Si ha participado",1,0)
data$parpol<-ifelse(data$p27_3=="Ns/Nc",NA,data$parpol)
table(data$parpol)
##
## 0 1
## 9542 1341
table(data$p27_7)
##
## No ha participado Ns/Nc Si ha participado
## 10385 119 496
data$protesta<-ifelse(data$p27_7=="Si ha participado",1,0)
data$protesta<-ifelse(data$p27_7=="Ns/Nc",NA,data$protesta)
table(data$protesta)
##
## 0 1
## 10385 496
table(data$p26)
##
## En la democracia las reglas son iguales para todos
## 3026
## En la democracia muchos participan y pocos ganan
## 5362
## En la democracia todos colaboran para lograr un mismo objet
## 2112
## NC
## 185
## NS
## 315
data$percdem<-data$p26
data$percdem<-ifelse(data$p26=="NC",NA,data$percdem)
data$percdem<-ifelse(data$p26=="NS",NA,data$percdem)
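# Note: ifelse() drops the factor attributes of p26, so percdem now holds the
# underlying integer codes (1, 2, 3) of the three substantive answers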
table(data$percdem)
##
## 1 2 3
## 3026 5362 2112
data$percdem<-factor(data$percdem,levels=c(1,2,3),labels=c("Mismas_reglas","Pocos_ganan","Todos_colab"))
table(data$percdem)
##
## Mismas_reglas Pocos_ganan Todos_colab
## 3026 5362 2112
data$mujer<-ifelse(data$s1=="Mujer",1,0)
table(data$mujer)
##
## 0 1
## 4858 6142
table(data$s2)
##
## 18 19 20 21 22 23 24 25 26 27 28 29
## 277 255 253 226 241 240 252 226 185 213 241 237
## 30 31 32 33 34 35 36 37 38 39 40 41
## 285 154 228 212 191 204 244 216 248 218 335 135
## 42 43 44 45 46 47 48 49 50 51 52 53
## 299 207 178 280 165 185 274 200 274 147 228 192
## 54 55 56 57 58 59 60 61 62 63 64 65
## 184 151 200 167 177 139 174 95 126 119 85 135
## 66 67 68 69 70 71 72 73 74 75 76 77
## 85 87 110 53 117 45 69 53 61 56 64 36
## 78 79 80 81 82 83 84 85 86 87 88 89
## 39 36 47 15 14 27 23 15 9 12 5 7
## 90 91 92 93 94 96 97 98 NS/NC
## 5 3 2 1 2 2 1 1 1
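# "NS/NC" cannot be coerced to a number, so it becomes NA (hence the warning below)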
data$edad<-as.numeric(as.character(data$s2))
## Warning: NAs introduced by coercion
table(data$edad)
##
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## 277 255 253 226 241 240 252 226 185 213 241 237 285 154 228 212 191 204
## 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
## 244 216 248 218 335 135 299 207 178 280 165 185 274 200 274 147 228 192
## 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
## 184 151 200 167 177 139 174 95 126 119 85 135 85 87 110 53 117 45
## 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
## 69 53 61 56 64 36 39 36 47 15 14 27 23 15 9 12 5 7
## 90 91 92 93 94 96 97 98
## 5 3 2 1 2 2 1 1
table(data$s3)
##
## Carrera Tecnica NC NS
## 713 21 4
## Ninguno Preparatoria Primaria completa
## 570 1805 1788
## Primaria incompleta Secundaria completa Secundaria incompleta
## 1363 2640 787
## Universidad y mas
## 1309
data$escol<-0
data$escol<-ifelse(data$s3=="Ninguno" | data$s3=="Primaria incompleta",1,data$escol)
data$escol<-ifelse(data$s3=="Primaria completa" | data$s3=="Secundaria incompleta",2,data$escol)
data$escol<-ifelse(data$s3=="Secundaria completa",3,data$escol)
data$escol<-ifelse(data$s3=="Preparatoria" | data$s3=="Carrera Tecnica",4,data$escol)
data$escol<-ifelse(data$s3=="Universidad y mas",5,data$escol)
data$escol<-ifelse(data$s3=="NC" | data$s3=="NS",NA,data$escol)
table(data$escol)
##
## 1 2 3 4 5
## 1933 2575 2640 2518 1309
data$escol<-factor(data$escol,levels=c(1,2,3,4,5),
labels=c("Prim_incom","Primaria","Secundaria","Bachillerato","Superior"))
table(data$escol)
##
## Prim_incom Primaria Secundaria Bachillerato Superior
## 1933 2575 2640 2518 1309
table(data$s11)
##
## NS/NC No Si
## 54 7156 3790
data$benpp<-ifelse(data$s11=="Si",1,0)
data$benpp<-ifelse(data$s11=="NS/NC",NA,data$benpp)
table(data$benpp)
##
## 0 1
## 7156 3790
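Before moving on, it can be worth double-checking a couple of the recodes against their source items; a quick sketch, using the variables as defined above:
table(data$p23, data$voto, useNA = "ifany")   # original item vs. recoded turnout
table(data$s11, data$benpp, useNA = "ifany")  # original item vs. recoded beneficiary flag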
Assuming we are 100% sure these will be our final variables, we can move them to a new data frame; that way we avoid carrying around other variables that might just get in the way.
names<-c("voto","libpol","parpol","protesta","percdem","mujer","edad","escol","benpp")
data2<-data[names]
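# (Minor caveat: "names" shadows the base function names() here; calls like
# names(data2) below still work because R looks up functions separately.)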
Let's see how our new variables look.
# trick to find out how many NAs each of our variables has...
for (Var in names(data2)) {
missing <- sum(is.na(data2[, Var]))
if (missing > 0) {
print(c(Var, missing))
}
}
## [1] "voto" "69"
## [1] "libpol" "30"
## [1] "parpol" "117"
## [1] "protesta" "119"
## [1] "percdem" "500"
## [1] "edad" "1"
## [1] "escol" "25"
## [1] "benpp" "54"
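A more compact alternative for the same counts (a sketch, using the data2 built above):
colSums(is.na(data2))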
To keep only the cases that are valid on every variable, I use complete.cases:
dataf<-data2[complete.cases(data2),]
# Checking the missing values again
for (Var in names(dataf)) {
missing <- sum(is.na(dataf[, Var]))
if (missing > 0) {
print(c(Var, missing))
}
}
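The loop now prints nothing because there are no missing values left; a quick sketch to see how many cases were dropped:
nrow(data2) - nrow(dataf)   # cases removed by complete.cases()
nrow(dataf)                 # valid cases that remain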
First we will run some descriptive statistics.
prop.table(table(dataf$libpol,dataf$voto),1)
##
## 0 1
## 0 0.2624799 0.7375201
## 1 0.2072191 0.7927809
prop.table(table(dataf$parpol,dataf$voto),1)
##
## 0 1
## 0 0.2324748 0.7675252
## 1 0.1108491 0.8891509
prop.table(table(dataf$protesta,dataf$voto),1)
##
## 0 1
## 0 0.2201109 0.7798891
## 1 0.1587983 0.8412017
prop.table(table(dataf$percdem,dataf$voto),1)
##
## 0 1
## Mismas_reglas 0.2189531 0.7810469
## Pocos_ganan 0.2234935 0.7765065
## Todos_colab 0.1990172 0.8009828
prop.table(table(dataf$mujer,dataf$voto),1)
##
## 0 1
## 0 0.2205425 0.7794575
## 1 0.2147616 0.7852384
describeBy(dataf$edad,group=dataf$voto)
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2217 38.12 16.67 35 36.48 19.27 18 93 75 0.68 -0.37
## se
## X1 0.35
## --------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 7985 43.78 15.94 42 42.9 17.79 18 98 80 0.42 -0.49
## se
## X1 0.18
prop.table(table(dataf$escol,dataf$voto),1)
##
## 0 1
## Prim_incom 0.1983759 0.8016241
## Primaria 0.2144649 0.7855351
## Secundaria 0.2242424 0.7757576
## Bachillerato 0.2485331 0.7514669
## Superior 0.1746939 0.8253061
prop.table(table(dataf$benpp,dataf$voto),1)
##
## 0 1
## 0 0.2306769 0.7693231
## 1 0.1921447 0.8078553
prop.test(table(dataf$libpol,dataf$voto))
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: table(dataf$libpol, dataf$voto)
## X-squared = 27.017, df = 1, p-value = 2.017e-07
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.03314155 0.07738001
## sample estimates:
## prop 1 prop 2
## 0.2624799 0.2072191
prop.test(table(dataf$parpol,dataf$voto))
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: table(dataf$parpol, dataf$voto)
## X-squared = 96.122, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.1018269 0.1414246
## sample estimates:
## prop 1 prop 2
## 0.2324748 0.1108491
prop.test(table(dataf$protesta,dataf$voto))
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: table(dataf$protesta, dataf$voto)
## X-squared = 9.4719, df = 1, p-value = 0.002086
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.02599906 0.09662623
## sample estimates:
## prop 1 prop 2
## 0.2201109 0.1587983
prop.test(table(dataf$percdem,dataf$voto))
##
## 3-sample test for equality of proportions without continuity
## correction
##
## data: table(dataf$percdem, dataf$voto)
## X-squared = 5.2289, df = 2, p-value = 0.07321
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3
## 0.2189531 0.2234935 0.1990172
prop.test(table(dataf$mujer,dataf$voto))
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: table(dataf$mujer, dataf$voto)
## X-squared = 0.46072, df = 1, p-value = 0.4973
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.01055437 0.02211616
## sample estimates:
## prop 1 prop 2
## 0.2205425 0.2147616
t.test(dataf$edad~dataf$voto)
##
## Welch Two Sample t-test
##
## data: dataf$edad by dataf$voto
## t = -14.278, df = 3422.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.437189 -4.882776
## sample estimates:
## mean in group 0 mean in group 1
## 38.11547 43.77545
prop.test(table(dataf$escol,dataf$voto))
##
## 5-sample test for equality of proportions without continuity
## correction
##
## data: table(dataf$escol, dataf$voto)
## X-squared = 31.203, df = 4, p-value = 2.783e-06
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3 prop 4 prop 5
## 0.1983759 0.2144649 0.2242424 0.2485331 0.1746939
prop.test(table(dataf$benpp,dataf$voto))
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: table(dataf$benpp, dataf$voto)
## X-squared = 19.95, df = 1, p-value = 7.948e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.02185968 0.05520472
## sample estimates:
## prop 1 prop 2
## 0.2306769 0.1921447
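One way to pull the bivariate tests together into a single summary table (a sketch; the vars vector and the cuadro data frame are illustrative names, not part of the original script):
vars <- c("libpol", "parpol", "protesta", "mujer", "benpp")
cuadro <- data.frame(
  variable = vars,
  p_value  = sapply(vars, function(v) prop.test(table(dataf[[v]], dataf$voto))$p.value)
)
cuadro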
Now we can fit a logistic model. I always like to start with a null model…
model0<-glm(voto~1,data=dataf,family=binomial)
summary(model0)
##
## Call:
## glm(formula = voto ~ 1, family = binomial, data = dataf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.747 0.700 0.700 0.700 0.700
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.28141 0.02401 53.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10681 on 10201 degrees of freedom
## Residual deviance: 10681 on 10201 degrees of freedom
## AIC: 10683
##
## Number of Fisher Scoring iterations: 4
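A quick sanity check (a sketch): the intercept of the null model is just the log-odds of the overall turnout rate, so transforming it back should reproduce the sample proportion of voters.
plogis(coef(model0))   # inverse logit of the intercept
mean(dataf$voto)       # overall proportion who voted; both are about 0.78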
Now let's add our independent variables.
model1<-glm(voto~libpol+parpol+protesta,data=dataf,family=binomial)
summary(model1)
##
## Call:
## glm(formula = voto ~ libpol + parpol + protesta, family = binomial,
## data = dataf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1951 0.4791 0.7104 0.7104 0.8105
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.94472 0.05360 17.624 < 2e-16 ***
## libpol 0.30348 0.05954 5.097 3.45e-07 ***
## parpol 0.85865 0.09388 9.146 < 2e-16 ***
## protesta 0.20822 0.13210 1.576 0.115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10681 on 10201 degrees of freedom
## Residual deviance: 10543 on 10198 degrees of freedom
## AIC: 10551
##
## Number of Fisher Scoring iterations: 4
What does this model tell us? How do we interpret those values?
I like to think in terms of odds ratios… So:
exp(coef(model1))
## (Intercept) libpol parpol protesta
## 2.572098 1.354569 2.359974 1.231481
exp(cbind(OR=coef(model1),confint(model1)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 2.572098 2.3173679 2.859389
## libpol 1.354569 1.2046166 1.521359
## parpol 2.359974 1.9701942 2.847389
## protesta 1.231481 0.9562589 1.606187
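Odds ratios act on the odds, not directly on probabilities, so it can also help to translate the model into predicted probabilities for a few profiles; a sketch with illustrative profiles (the profiles data frame is not part of the original script):
profiles <- data.frame(libpol = 1, parpol = c(0, 1), protesta = 0)
predict(model1, newdata = profiles, type = "response")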
Let's add another variable. In this case, perception of democracy is nominal with more than two categories; since we already defined it as a factor in R, wrapping it in factor() inside the model formula should not be necessary. Let's check…
model2<-glm(voto~libpol+parpol+protesta+factor(percdem),data=dataf,family=binomial)
summary(model2)
##
## Call:
## glm(formula = voto ~ libpol + parpol + protesta + factor(percdem),
## family = binomial, data = dataf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2383 0.4811 0.7132 0.7216 0.8217
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.93873 0.06533 14.368 < 2e-16 ***
## libpol 0.30050 0.05958 5.044 4.56e-07 ***
## parpol 0.85887 0.09390 9.146 < 2e-16 ***
## protesta 0.20736 0.13216 1.569 0.117
## factor(percdem)Pocos_ganan -0.02642 0.05603 -0.471 0.637
## factor(percdem)Todos_colab 0.11444 0.07174 1.595 0.111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10681 on 10201 degrees of freedom
## Residual deviance: 10539 on 10196 degrees of freedom
## AIC: 10551
##
## Number of Fisher Scoring iterations: 4
model2<-glm(voto~libpol+parpol+protesta+percdem,data=dataf,family=binomial)
summary(model2)
##
## Call:
## glm(formula = voto ~ libpol + parpol + protesta + percdem, family = binomial,
## data = dataf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2383 0.4811 0.7132 0.7216 0.8217
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.93873 0.06533 14.368 < 2e-16 ***
## libpol 0.30050 0.05958 5.044 4.56e-07 ***
## parpol 0.85887 0.09390 9.146 < 2e-16 ***
## protesta 0.20736 0.13216 1.569 0.117
## percdemPocos_ganan -0.02642 0.05603 -0.471 0.637
## percdemTodos_colab 0.11444 0.07174 1.595 0.111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10681 on 10201 degrees of freedom
## Residual deviance: 10539 on 10196 degrees of freedom
## AIC: 10551
##
## Number of Fisher Scoring iterations: 4
exp(cbind(OR=coef(model2),confint(model2)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 2.5567441 2.2511936 2.908418
## libpol 1.3505278 1.2009436 1.516918
## parpol 2.3604818 1.9705178 2.848141
## protesta 1.2304217 0.9553360 1.604962
## percdemPocos_ganan 0.9739288 0.8723429 1.086643
## percdemTodos_colab 1.1212413 0.9745584 1.291109
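Since model2 simply adds percdem to model1 and both are fit on the same cases, a likelihood-ratio test is a natural way to check whether perception of democracy improves the fit (a sketch); the small drop in deviance is consistent with the individually non-significant percdem dummies.
anova(model1, model2, test = "Chisq")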
A "pretty" way to present our models is with a plot.
sjp.glm(model2,title="Odds ratios of the logistic model for Voted vs. Did not vote",
digits=2,sortOdds=F)
## Waiting for profiling to be done...
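Note that sjp.glm() was deprecated in later releases of sjPlot; if it is not available, plot_model() is the intended replacement (a sketch, assuming a recent sjPlot version):
plot_model(model2, title = "Odds ratios of the logistic model for Voted vs. Did not vote")
# plot_model() leaves estimates unsorted by default, matching sortOdds = F above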
Now let's add more variables.
model3<-glm(voto~libpol+parpol+protesta+percdem+mujer+edad+escol+benpp,
data=dataf,family=binomial(link = "logit"))
summary(model3)
##
## Call:
## glm(formula = voto ~ libpol + parpol + protesta + percdem + mujer +
## edad + escol + benpp, family = binomial(link = "logit"),
## data = dataf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5219 0.4075 0.6204 0.7510 1.1252
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.450951 0.133504 -3.378 0.000731 ***
## libpol 0.259058 0.060581 4.276 1.90e-05 ***
## parpol 0.823433 0.094740 8.691 < 2e-16 ***
## protesta 0.197204 0.133870 1.473 0.140723
## percdemPocos_ganan -0.062291 0.056954 -1.094 0.274082
## percdemTodos_colab 0.099865 0.072734 1.373 0.169745
## mujer 0.105123 0.049637 2.118 0.034189 *
## edad 0.026102 0.001818 14.355 < 2e-16 ***
## escolPrimaria 0.181087 0.081959 2.209 0.027142 *
## escolSecundaria 0.319557 0.085031 3.758 0.000171 ***
## escolBachillerato 0.280247 0.087721 3.195 0.001399 **
## escolSuperior 0.657244 0.105398 6.236 4.49e-10 ***
## benpp 0.182107 0.053661 3.394 0.000690 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10681 on 10201 degrees of freedom
## Residual deviance: 10274 on 10189 degrees of freedom
## AIC: 10300
##
## Number of Fisher Scoring iterations: 4
exp(cbind(OR=coef(model3),confint(model3)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.6370220 0.4903631 0.8276156
## libpol 1.2957083 1.1499343 1.4582251
## parpol 2.2783069 1.8986516 2.7533222
## protesta 1.2179921 0.9423664 1.5938390
## percdemPocos_ganan 0.9396094 0.8400778 1.0502462
## percdemTodos_colab 1.1050219 0.9585731 1.2748951
## mujer 1.1108470 1.0077960 1.2242867
## edad 1.0264457 1.0228112 1.0301281
## escolPrimaria 1.1985192 1.0203062 1.4069988
## escolSecundaria 1.3765185 1.1648418 1.6257620
## escolBachillerato 1.3234568 1.1140952 1.5714103
## escolSuperior 1.9294666 1.5707515 2.3746329
## benpp 1.1997426 1.0803408 1.3332918
sjp.glm(model3,title="Odds ratios of the logistic model for Voted vs. Did not vote",
digits=2,sort.est=F)
## Waiting for profiling to be done...
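Finally, a compact way to compare the overall fit of the three models is McFadden's pseudo-R² (one minus the ratio of residual to null deviance); a sketch:
sapply(list(model1 = model1, model2 = model2, model3 = model3),
       function(m) 1 - m$deviance / m$null.deviance)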