library(data.table)
library(margins)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(rpart)
library(rpart.plot)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dbscan)
Estimaos modelo que depende tanto de caracteríticas de los alumnos/hogares como de las escuelas postuladas:
f01 <- formula(dummy_dependencia~es_mujer+soc_dim+subway_student_distance+crimen_sobre_mediana+copago)
reg_lm <- lm(f01,data=base)
summary(reg_lm)
##
## Call:
## lm(formula = f01, data = base)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0914 -0.1223 0.1442 0.2671 0.5486
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1.08340029585 0.00849709149 127.502
## es_mujer 0.00982727903 0.00261303821 3.761
## soc_dim -0.76552978570 0.01393670430 -54.929
## subway_student_distance 0.02235223219 0.00062684915 35.658
## crimen_sobre_mediana -0.00362479029 0.00263380539 -1.376
## copago 0.00000543085 0.00000004684 115.940
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## es_mujer 0.000169 ***
## soc_dim < 0.0000000000000002 ***
## subway_student_distance < 0.0000000000000002 ***
## crimen_sobre_mediana 0.168746
## copago < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3711 on 80719 degrees of freedom
## Multiple R-squared: 0.1616, Adjusted R-squared: 0.1615
## F-statistic: 3111 on 5 and 80719 DF, p-value: < 0.00000000000000022
reg_lg <- glm(f01,data=base,family = binomial(link = "logit"))
summary(reg_lg)
##
## Call:
## glm(formula = f01, family = binomial(link = "logit"), data = base)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.71016 0.00060 0.05485 0.77425 1.60287
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.307419374 0.063562864 52.034 <0.0000000000000002
## es_mujer 0.059683936 0.019578999 3.048 0.0023
## soc_dim -5.172840290 0.102596843 -50.419 <0.0000000000000002
## subway_student_distance 0.171996250 0.005684831 30.255 <0.0000000000000002
## crimen_sobre_mediana 0.042793933 0.019773897 2.164 0.0305
## copago 0.000187169 0.000005923 31.599 <0.0000000000000002
##
## (Intercept) ***
## es_mujer **
## soc_dim ***
## subway_student_distance ***
## crimen_sobre_mediana *
## copago ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 82370 on 80724 degrees of freedom
## Residual deviance: 61500 on 80719 degrees of freedom
## AIC: 61512
##
## Number of Fisher Scoring iterations: 10
margins(reg_lg)
## Average marginal effects
## glm(formula = f01, family = binomial(link = "logit"), data = base)
## es_mujer soc_dim subway_student_distance crimen_sobre_mediana copago
## 0.007727 -0.6697 0.02227 0.00554 0.00002423
base$fit_lineal <- fitted(reg_lm,base,'probabilities')
base$fit_log <- fitted(reg_lg,base,'probabilities')
data.table(RMSE=RMSE(base$fit_lineal,base$dummy_dependencia),
MAE=MAE(base$fit_lineal,base$dummy_dependencia))
## RMSE MAE
## 1: 0.371102 0.3005324
data.table(RMSE=RMSE(base$fit_log,base$dummy_dependencia),
MAE=MAE(base$fit_log,base$dummy_dependencia))
## RMSE MAE
## 1: 0.3594941 0.2587036
set.seed(1234)
setupKCV <- trainControl(method = 'cv',number = 5, p=0.7)
train(f01,data=base,method='lm',trControl=setupKCV)
## Linear Regression
##
## 80725 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 64580, 64580, 64580, 64580, 64580
## Resampling results:
##
## RMSE Rsquared MAE
## 0.3711335 0.161433 0.3005676
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
train(f01,data=base,method='glm',trControl=setupKCV)
## Generalized Linear Model
##
## 80725 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 64580, 64580, 64580, 64580, 64580
## Resampling results:
##
## RMSE Rsquared MAE
## 0.3711254 0.1615277 0.3005544
base[,Prioritario:=ifelse(prioritario==1,'Prioritario','No prioritario')]
arbol <- rpart(Dependencia ~ copago,data=base,method = 'class')
rpart.plot(arbol)
table(base$Dependencia)/nrow(base)
##
## PARTICULAR SUBVENCIONADO PUBLICO
## 0.7928275 0.2071725
quantile(base$copago,probs = seq(0,1,0.1))
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00 29169.80
## 80% 90% 100%
## 48000.00 67284.29 116000.00
Creamos base de muestra (muy pesada para graficar entera) y gaficamos distribución espacial:
samp <- students[sample(.N,200),.(seg_dim,soc_dim)]
ggplot(samp) +
geom_point(aes(x=seg_dim,y=soc_dim))
cluster <- kmeans(x = samp,centers = 3,nstart = 50)
fviz_cluster(cluster,data=samp,geom = "point")