Paquetes

library(data.table)
library(margins)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(rpart)
library(rpart.plot)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dbscan)

Bases a utilizar:

Unimos bases

Regresión de probabilidad lineal

Estimaos modelo que depende tanto de caracteríticas de los alumnos/hogares como de las escuelas postuladas:

f01 <- formula(dummy_dependencia~es_mujer+soc_dim+subway_student_distance+crimen_sobre_mediana+copago)
reg_lm <- lm(f01,data=base)
summary(reg_lm)
## 
## Call:
## lm(formula = f01, data = base)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0914 -0.1223  0.1442  0.2671  0.5486 
## 
## Coefficients:
##                               Estimate     Std. Error t value
## (Intercept)              1.08340029585  0.00849709149 127.502
## es_mujer                 0.00982727903  0.00261303821   3.761
## soc_dim                 -0.76552978570  0.01393670430 -54.929
## subway_student_distance  0.02235223219  0.00062684915  35.658
## crimen_sobre_mediana    -0.00362479029  0.00263380539  -1.376
## copago                   0.00000543085  0.00000004684 115.940
##                                     Pr(>|t|)    
## (Intercept)             < 0.0000000000000002 ***
## es_mujer                            0.000169 ***
## soc_dim                 < 0.0000000000000002 ***
## subway_student_distance < 0.0000000000000002 ***
## crimen_sobre_mediana                0.168746    
## copago                  < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3711 on 80719 degrees of freedom
## Multiple R-squared:  0.1616, Adjusted R-squared:  0.1615 
## F-statistic:  3111 on 5 and 80719 DF,  p-value: < 0.00000000000000022

Regesión logistica

reg_lg <- glm(f01,data=base,family = binomial(link = "logit"))
summary(reg_lg)
## 
## Call:
## glm(formula = f01, family = binomial(link = "logit"), data = base)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.71016   0.00060   0.05485   0.77425   1.60287  
## 
## Coefficients:
##                             Estimate   Std. Error z value            Pr(>|z|)
## (Intercept)              3.307419374  0.063562864  52.034 <0.0000000000000002
## es_mujer                 0.059683936  0.019578999   3.048              0.0023
## soc_dim                 -5.172840290  0.102596843 -50.419 <0.0000000000000002
## subway_student_distance  0.171996250  0.005684831  30.255 <0.0000000000000002
## crimen_sobre_mediana     0.042793933  0.019773897   2.164              0.0305
## copago                   0.000187169  0.000005923  31.599 <0.0000000000000002
##                            
## (Intercept)             ***
## es_mujer                ** 
## soc_dim                 ***
## subway_student_distance ***
## crimen_sobre_mediana    *  
## copago                  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 82370  on 80724  degrees of freedom
## Residual deviance: 61500  on 80719  degrees of freedom
## AIC: 61512
## 
## Number of Fisher Scoring iterations: 10
margins(reg_lg)
## Average marginal effects
## glm(formula = f01, family = binomial(link = "logit"), data = base)
##  es_mujer soc_dim subway_student_distance crimen_sobre_mediana     copago
##  0.007727 -0.6697                 0.02227              0.00554 0.00002423

Predecimos resultados

base$fit_lineal <- fitted(reg_lm,base,'probabilities')
base$fit_log <- fitted(reg_lg,base,'probabilities')

Error cuadratico medio modelo lineal

data.table(RMSE=RMSE(base$fit_lineal,base$dummy_dependencia),
           MAE=MAE(base$fit_lineal,base$dummy_dependencia))
##        RMSE       MAE
## 1: 0.371102 0.3005324

Error cuadratico medio modelo logistico

data.table(RMSE=RMSE(base$fit_log,base$dummy_dependencia),
           MAE=MAE(base$fit_log,base$dummy_dependencia))
##         RMSE       MAE
## 1: 0.3594941 0.2587036

Cross validation con K-fols

set.seed(1234)

setupKCV <- trainControl(method = 'cv',number = 5, p=0.7)

Modelo lineal

train(f01,data=base,method='lm',trControl=setupKCV)
## Linear Regression 
## 
## 80725 samples
##     5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 64580, 64580, 64580, 64580, 64580 
## Resampling results:
## 
##   RMSE       Rsquared  MAE      
##   0.3711335  0.161433  0.3005676
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Modelo logistico

train(f01,data=base,method='glm',trControl=setupKCV)
## Generalized Linear Model 
## 
## 80725 samples
##     5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 64580, 64580, 64580, 64580, 64580 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.3711254  0.1615277  0.3005544

Arboles de decisión

base[,Prioritario:=ifelse(prioritario==1,'Prioritario','No prioritario')]
arbol <- rpart(Dependencia ~ copago,data=base,method = 'class')
rpart.plot(arbol)

table(base$Dependencia)/nrow(base)
## 
## PARTICULAR SUBVENCIONADO                  PUBLICO 
##                0.7928275                0.2071725
quantile(base$copago,probs = seq(0,1,0.1))
##        0%       10%       20%       30%       40%       50%       60%       70% 
##      0.00      0.00      0.00      0.00      0.00      0.00      0.00  29169.80 
##       80%       90%      100% 
##  48000.00  67284.29 116000.00

Cluster

Creamos base de muestra (muy pesada para graficar entera) y gaficamos distribución espacial:

samp <- students[sample(.N,200),.(seg_dim,soc_dim)]
ggplot(samp) +
  geom_point(aes(x=seg_dim,y=soc_dim))

cluster <- kmeans(x = samp,centers = 3,nstart = 50)
fviz_cluster(cluster,data=samp,geom = "point")