A logistic model can be expressed as follows:
\[y_{i} \sim Bernoulli(\theta_{i})\] \[log(\frac{\theta_{i}}{1-\theta_{i}}) = X_{i}^{'}\beta\] The parameter \(\theta_{i}\) of the distribution of the dependent variable \(y_{i}\) is linked to a linear combination of the predictor variables \(X_{i}\) through the function \(log(\frac{\theta_{i}}{1-\theta_{i}})\), known as the logit.
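Inverting the logit link gives the success probability directly in terms of the linear predictor:
\[\theta_{i} = \frac{exp(X_{i}^{'}\beta)}{1 + exp(X_{i}^{'}\beta)}\]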
The goal of the model is to estimate the parameter \(\theta_{i}\) through a parameter vector \(\beta\) of the linear combination \(X_{i}^{'}\beta\) that minimizes the estimation error. Several estimation methods exist, and they are the subject of this work.
We use the “Titanic” dataset, which can be downloaded from Titanic-Dataset (train.csv).
The “Titanic” dataset contains the following attributes (shown in the first rows below):
# ==============================================================================
# Libraries
# ==============================================================================
#install.packages("caret")
#install.packages("devtools")
#install.packages("Metrics")
#install.packages('Amelia')
library("caret")
library("statisticalModeling")
library("Metrics")
library("Amelia")
library("ROCR")
#===============================================================================
# Importing the data
#===============================================================================
# Remove all objects from the Global Environment
rm(list = ls())
ls_ruta = "D:/Doctorado/DCIES/02.Ciclo/DCIES-241 Estadística aplicada/sesion06/Laboratorio2/Dataset"
setwd(ls_ruta)
df_data= as.data.frame(read.csv("train.csv"))
head(df_data)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
Displaying missing or incomplete values:
# Discarding variables: PassengerId, Name, Ticket, Cabin
df_titanic = df_data[ ,!(names(df_data) %in% c("PassengerId","Name","Ticket","Cabin"))]
missmap(df_titanic, main = "Missing vs. observed values")
Cleaning attributes, imputing missing data, and creating dummy variables and factors:
# Counting missing (NA) values per column
colSums(is.na(df_titanic))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 177 0 0 0 0
# Imputing the (floored) mean age for missing values
li_edad = floor(mean(df_titanic$Age[!is.na(df_titanic$Age)]))
df_titanic$Age[is.na(df_titanic$Age)] = li_edad
colSums(is.na(df_titanic))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 0 0 0 0 0
# Converting categorical variables to factors
df_titanic$Sex = factor(df_titanic$Sex)
summary(df_titanic$Sex)
## female male
## 314 577
df_titanic$Sex = model.matrix(~df_titanic$Sex)[,2]
summary(df_titanic$Sex)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.6476 1.0000 1.0000
df_titanic$Embarked = factor(df_titanic$Embarked)
summary(df_titanic$Embarked)
## C Q S
## 2 168 77 644
df_titanic$EmbarkedC = model.matrix(~df_titanic$Embarked)[,2]
df_titanic$EmbarkedQ = model.matrix(~df_titanic$Embarked)[,3]
df_titanic$EmbarkedS = model.matrix(~df_titanic$Embarked)[,4]
# Discarding variable: Embarked
df_titanic = df_titanic[ ,!(names(df_titanic) %in% c("Embarked"))]
missmap(df_titanic, main = "Missing vs. observed values")
We use 80% of the data for training and the remaining 20% as a test set to evaluate the proposed models. The training and test observations are selected at random.
head(df_titanic,10)
## Survived Pclass Sex Age SibSp Parch Fare EmbarkedC EmbarkedQ EmbarkedS
## 1 0 3 1 22 1 0 7.2500 0 0 1
## 2 1 1 0 38 1 0 71.2833 1 0 0
## 3 1 3 0 26 0 0 7.9250 0 0 1
## 4 1 1 0 35 1 0 53.1000 0 0 1
## 5 0 3 1 35 0 0 8.0500 0 0 1
## 6 0 3 1 29 0 0 8.4583 0 1 0
## 7 0 1 1 54 0 0 51.8625 0 0 1
## 8 0 3 1 2 3 1 21.0750 0 0 1
## 9 1 3 0 27 0 2 11.1333 0 0 1
## 10 1 2 0 14 1 0 30.0708 1 0 0
split = 0.8
train = sort(sample(nrow(df_titanic),floor(nrow(df_titanic)*split)))
df_titanic_train = df_titanic[train,]
df_titanic_test = df_titanic[-train,]
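Because the split is random, the exact figures reported below depend on the draw. A seed can be fixed before sampling to make the split reproducible; a minimal sketch (the seed value 123 is arbitrary and was not used in the original run):
# Optional: fix the seed so the train/test split (and all results below) is reproducible
set.seed(123)
train = sort(sample(nrow(df_titanic), floor(nrow(df_titanic)*split)))
df_titanic_train = df_titanic[train,]
df_titanic_test = df_titanic[-train,]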
Five models are proposed and fitted on the training data df_titanic_train, and their predictive capacity is then evaluated on the test data df_titanic_test.
#===============================================================================
# Logistic model using R's glm function
#===============================================================================
glm_titanic_1 = glm(Survived~Pclass+Sex+Age+SibSp+Parch+
Fare+EmbarkedC+EmbarkedQ+ EmbarkedS,
data =df_titanic_train,family=binomial(link = "logit"))
glm_titanic_2 = glm(Survived~Pclass+Sex+Age+SibSp+Parch+Fare,
data =df_titanic_train,family=binomial(link = "logit"))
glm_titanic_3 = glm(Survived~Pclass+Sex+Age+SibSp+Parch,
data =df_titanic_train,family=binomial(link = "logit"))
glm_titanic_4 = glm(Survived~Pclass+Sex+Age+SibSp,
data =df_titanic_train,family=binomial(link = "logit"))
glm_titanic_5 = glm(Survived~Pclass+Sex+Age,
data =df_titanic_train,family=binomial(link = "logit"))
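Before looking at test-set performance, the in-sample fit of the five candidates can also be compared directly by AIC; a quick check of this kind (using the model objects fitted above) would be:
# Compare the five candidate models by AIC (lower is better)
AIC(glm_titanic_1, glm_titanic_2, glm_titanic_3, glm_titanic_4, glm_titanic_5)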
R function to evaluate the models using the Accuracy and MASE indicators; it also computes the optimal cutoff from the ROC curve.
#===============================================================================
# Logistic model indicators
#===============================================================================
fEvaluaModelo = function(glm_modelo,df_evaluar){
# Optimal cutoff (point of the ROC curve closest to (0, 1))
opt.cut = function(perf, pred){
cut.ind = mapply(FUN=function(x, y, p){
d = (x - 0)^2 + (y-1)^2
ind = which(d == min(d))
c(sensitivity = y[[ind]], specificity = 1-x[[ind]],
cutoff = p[[ind]])
}, perf@x.values, perf@y.values, pred@cutoffs)
}
print(summary(glm_modelo))
df_evaluar$ProbSurvived = predict(glm_modelo, df_evaluar, type = "response")
# ROC curve
xpred = prediction(predictions = df_evaluar$ProbSurvived ,
labels = df_evaluar$Survived )
xroc.perf = performance(xpred, "tpr", "fpr")
# Indicators
xcut = opt.cut(xroc.perf, xpred)[3]
df_evaluar$PSurvived = ifelse(df_evaluar$ProbSurvived < (1 - xcut),0,1)
xmc = confusionMatrix(as.factor(df_evaluar$PSurvived),
as.factor(df_evaluar$Survived))
xmatrizconfusion = xmc[2]$table
xaccuracy = xmc[3]$overall[1]*100
xmase = mase(actual = df_evaluar$Survived,
predicted = df_evaluar$PSurvived)
print("Indicadores: ")
cat("\nAccuracy = ", xaccuracy,
"\nOptime Cut = ",xcut,"\nMASE = ", xmase )
# Plot the ROC curve
plot(xroc.perf,
main="Curva ROC para test \n Modelo logístico",
col ="blue",
lwd=2)
abline(0, 1, col="red",lwd = 2, lty = 2)
}
Evaluation of model 1: Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + EmbarkedC + EmbarkedQ + EmbarkedS
fEvaluaModelo(glm_titanic_1,df_titanic_test)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch +
## Fare + EmbarkedC + EmbarkedQ + EmbarkedS, family = binomial(link = "logit"),
## data = df_titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6680 -0.5951 -0.4173 0.6505 2.4234
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 16.118742 535.411440 0.030 0.97598
## Pclass -1.060842 0.162965 -6.510 7.53e-11 ***
## Sex -2.665384 0.223379 -11.932 < 2e-16 ***
## Age -0.043967 0.009046 -4.860 1.17e-06 ***
## SibSp -0.363316 0.119538 -3.039 0.00237 **
## Parch -0.089789 0.135025 -0.665 0.50607
## Fare 0.002236 0.002997 0.746 0.45559
## EmbarkedC -10.684411 535.411270 -0.020 0.98408
## EmbarkedQ -10.828391 535.411345 -0.020 0.98386
## EmbarkedS -11.235874 535.411244 -0.021 0.98326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 952.58 on 711 degrees of freedom
## Residual deviance: 635.97 on 702 degrees of freedom
## AIC: 655.97
##
## Number of Fisher Scoring iterations: 12
##
## [1] "Indicadores: "
##
## Accuracy = 81.56425
## Optime Cut = 0.3938731
## MASE = 0.3528564
Evaluation of model 2: Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare
fEvaluaModelo(glm_titanic_2,df_titanic_test)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch +
## Fare, family = binomial(link = "logit"), data = df_titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7598 -0.6095 -0.4248 0.6318 2.4044
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.012999 0.607119 8.257 < 2e-16 ***
## Pclass -1.050991 0.157749 -6.662 2.69e-11 ***
## Sex -2.699249 0.220711 -12.230 < 2e-16 ***
## Age -0.044174 0.009005 -4.906 9.31e-07 ***
## SibSp -0.395742 0.119283 -3.318 0.000908 ***
## Parch -0.101122 0.133023 -0.760 0.447146
## Fare 0.003367 0.002988 1.127 0.259780
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 952.58 on 711 degrees of freedom
## Residual deviance: 641.18 on 705 degrees of freedom
## AIC: 655.18
##
## Number of Fisher Scoring iterations: 5
##
## [1] "Indicadores: "
##
## Accuracy = 82.68156
## Optime Cut = 0.3725099
## MASE = 0.3314711
Evaluation of model 3: Survived ~ Pclass + Sex + Age + SibSp + Parch
fEvaluaModelo(glm_titanic_3,df_titanic_test)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch,
## family = binomial(link = "logit"), data = df_titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6822 -0.6124 -0.4193 0.6314 2.4160
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.326138 0.546768 9.741 < 2e-16 ***
## Pclass -1.148079 0.134278 -8.550 < 2e-16 ***
## Sex -2.705593 0.220404 -12.276 < 2e-16 ***
## Age -0.044666 0.008989 -4.969 6.73e-07 ***
## SibSp -0.371975 0.116990 -3.180 0.00147 **
## Parch -0.073785 0.131187 -0.562 0.57382
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 952.58 on 711 degrees of freedom
## Residual deviance: 642.62 on 706 degrees of freedom
## AIC: 654.62
##
## Number of Fisher Scoring iterations: 5
##
## [1] "Indicadores: "
##
## Accuracy = 82.68156
## Optime Cut = 0.3825597
## MASE = 0.3314711
Evaluation of model 4: Survived ~ Pclass + Sex + Age + SibSp
fEvaluaModelo(glm_titanic_4,df_titanic_test)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = binomial(link = "logit"),
## data = df_titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7146 -0.6083 -0.4185 0.6312 2.4361
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.292151 0.543673 9.734 < 2e-16 ***
## Pclass -1.150491 0.134316 -8.566 < 2e-16 ***
## Sex -2.676648 0.213779 -12.521 < 2e-16 ***
## Age -0.044540 0.008982 -4.959 7.09e-07 ***
## SibSp -0.393461 0.111196 -3.538 0.000402 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 952.58 on 711 degrees of freedom
## Residual deviance: 642.94 on 707 degrees of freedom
## AIC: 652.94
##
## Number of Fisher Scoring iterations: 5
##
## [1] "Indicadores: "
##
## Accuracy = 82.12291
## Optime Cut = 0.3928708
## MASE = 0.3421638
Evaluation of model 5: Survived ~ Pclass + Sex + Age
fEvaluaModelo(glm_titanic_5,df_titanic_test)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age, family = binomial(link = "logit"),
## data = df_titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6575 -0.6740 -0.4428 0.6666 2.3967
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.710092 0.503712 9.351 < 2e-16 ***
## Pclass -1.136504 0.132733 -8.562 < 2e-16 ***
## Sex -2.527407 0.204356 -12.368 < 2e-16 ***
## Age -0.036071 0.008386 -4.301 1.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 952.58 on 711 degrees of freedom
## Residual deviance: 658.49 on 708 degrees of freedom
## AIC: 666.49
##
## Number of Fisher Scoring iterations: 4
##
## [1] "Indicadores: "
##
## Accuracy = 82.68156
## Optime Cut = 0.4021123
## MASE = 0.3314711
Model 4 (Survived ~ Pclass + Sex + Age + SibSp) is selected: among the five candidates it has the lowest AIC (652.94) and all of its coefficients are statistically significant.
The following output shows the parameter vector \(\beta\) and the Accuracy and MASE indicators:
fEvaluaModelo(glm_titanic_4,df_titanic_test)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = binomial(link = "logit"),
## data = df_titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7146 -0.6083 -0.4185 0.6312 2.4361
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.292151 0.543673 9.734 < 2e-16 ***
## Pclass -1.150491 0.134316 -8.566 < 2e-16 ***
## Sex -2.676648 0.213779 -12.521 < 2e-16 ***
## Age -0.044540 0.008982 -4.959 7.09e-07 ***
## SibSp -0.393461 0.111196 -3.538 0.000402 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 952.58 on 711 degrees of freedom
## Residual deviance: 642.94 on 707 degrees of freedom
## AIC: 652.94
##
## Number of Fisher Scoring iterations: 5
##
## [1] "Indicadores: "
##
## Accuracy = 82.12291
## Optime Cut = 0.3928708
## MASE = 0.3421638
The parameter vector \(\beta\) of model 4 (Survived ~ Pclass + Sex + Age + SibSp) will now be estimated with five methods: R's glm function, direct maximization of the likelihood with optim, Fisher scoring, IRLS, and Metropolis-Hastings.
Finally, a comparison table of the five methods is presented.
Preparing the comparison table and the data matrices:
# ==============================================================================
# Selected model
# ==============================================================================
df_cuadro_comparacion = data.frame(Variable = c("(Intercept)","Pclass","Sex",
"Age","SibSp"),
"R.glm" = c(0,0,0,0,0),
"R.optim" = c(0,0,0,0,0),
"Scoring.Fisher" = c(0,0,0,0,0),
"IRLS" = c(0,0,0,0,0),
"Metropolis.Hasting" = c(0,0,0,0,0))
# Preparing the data matrices
df_titanic_train$Intercept = 1
Y = as.matrix(df_titanic_train$Survived)
XX = as.matrix(df_titanic_train[,c("Intercept","Pclass","Sex","Age","SibSp")])
colnames(XX) = NULL
R's glm function estimates the parameter vector \(\beta\):
# ==============================================================================
# Method 1: Regression using R's glm function
# ==============================================================================
glm_titanic_4 = glm(Survived~Pclass+Sex+Age+SibSp,
data =df_titanic_train,family=binomial(link = "logit"))
b1 = as.numeric(coefficients(glm_titanic_4)[1])
b2 = as.numeric(coefficients(glm_titanic_4)[2])
b3 = as.numeric(coefficients(glm_titanic_4)[3])
b4 = as.numeric(coefficients(glm_titanic_4)[4])
b5 = as.numeric(coefficients(glm_titanic_4)[5])
df_cuadro_comparacion[,"R.glm"] = c(b1,b2,b3,b4,b5)
print(df_cuadro_comparacion)
## Variable R.glm R.optim Scoring.Fisher IRLS Metropolis.Hasting
## 1 (Intercept) 5.29215150 0 0 0 0
## 2 Pclass -1.15049134 0 0 0 0
## 3 Sex -2.67664817 0 0 0 0
## 4 Age -0.04454008 0 0 0 0
## 5 SibSp -0.39346077 0 0 0 0
R's optim function maximizes the model's log-likelihood (here, by minimizing its negative); the result is the parameter vector \(\beta\).
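For the Bernoulli model with logit link, the log-likelihood maximized here (coded below as fLogVerosimilitud) is:
\[\ell(\beta) = \sum_{i=1}^{n}\left[ y_{i}X_{i}^{'}\beta - log\left(1 + exp(X_{i}^{'}\beta)\right)\right]\]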
# ==============================================================================
# Method 2: Likelihood optimization with R's optim function
# ==============================================================================
#------------------------------------------------------------------------------
# Log-likelihood function
#------------------------------------------------------------------------------
fLogVerosimilitud = function (B,Y,XX) {
nrow = nrow(Y)
prow = nrow(B)
lvs = 0.0
for (i in 1:nrow) {
Yi = as.numeric(Y[i,1])
Xi = as.matrix(XX[i,],prow,1)
lvs = lvs + Yi*t(Xi)%*%B - log(1 + exp(t(Xi)%*%B))
}
return(lvs)
}
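#------------------------------------------------------------------------------
# (Sketch) Vectorized equivalent of fLogVerosimilitud, shown for reference only;
# it is not used in the runs below.
#------------------------------------------------------------------------------
fLogVerosimilitudVec = function(B, Y, XX) {
eta = XX %*% B            # n x 1 vector of linear predictors X'B
sum(Y*eta - log(1 + exp(eta)))
}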
#------------------------------------------------------------------------------
# Negative log-likelihood function (optim minimizes by default)
#------------------------------------------------------------------------------
fNegativoLogVerosimilitud= function(B,Y,XX) {
return( - fLogVerosimilitud (B,Y,XX))
}
Bo = matrix(0, nrow = ncol(XX), ncol = 1)
r = optim( Bo , fNegativoLogVerosimilitud, Y = Y,XX = XX)
b1 = r$par[1]
b2 = r$par[2]
b3 = r$par[3]
b4 = r$par[4]
b5 = r$par[5]
df_cuadro_comparacion[,"R.optim"] = c(b1,b2,b3,b4,b5)
print(df_cuadro_comparacion)
## Variable R.glm R.optim Scoring.Fisher IRLS Metropolis.Hasting
## 1 (Intercept) 5.29215150 5.30700031 0 0 0
## 2 Pclass -1.15049134 -1.12496995 0 0 0
## 3 Sex -2.67664817 -2.80118718 0 0 0
## 4 Age -0.04454008 -0.04383097 0 0 0
## 5 SibSp -0.39346077 -0.41496568 0 0 0
The Fisher scoring algorithm is used next; it requires the log-likelihood, its gradient, and its Hessian. The results table shows how the vector \(\beta\) is updated at each iteration until the log-likelihood reaches its optimum.
The 2-norm of the difference between consecutive iterates is used as the convergence criterion.
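The iteration implemented below updates the current estimate using the score \(U(\beta)\) and the Fisher information \(I(\beta)\):
\[\beta^{(t+1)} = \beta^{(t)} + I(\beta^{(t)})^{-1}U(\beta^{(t)})\] \[U(\beta) = \sum_{i=1}^{n}(y_{i} - \theta_{i})X_{i}, \qquad I(\beta) = \sum_{i=1}^{n}\theta_{i}(1-\theta_{i})X_{i}X_{i}^{'}\]
with \(\theta_{i} = \frac{exp(X_{i}^{'}\beta)}{1 + exp(X_{i}^{'}\beta)}\).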
# ==============================================================================
# Method 3: Fisher scoring
# ==============================================================================
#------------------------------------------------------------------------------
# Function computing the norm of the difference of two vectors
#------------------------------------------------------------------------------
fNorma = function (A,B){
return (sqrt(as.numeric(t(A- B)%*%(A- B))) )
}
#------------------------------------------------------------------------------
# Gradient (score) of the log-likelihood
#------------------------------------------------------------------------------
fGLogVerosimilitud = function (B,Y,XX) {
nrow = nrow(Y)
prow = nrow(B)
G = matrix(0, nrow = prow, ncol = 1)
for (i in 1:nrow) {
Yi = as.numeric(Y[i,1])
Xi = as.matrix(XX[i,],prow,1)
K = as.numeric((exp(t(Xi)%*%B))/((1 + exp(t(Xi)%*%B))))
G = G + Yi*Xi - K*Xi
}
return(G)
}
#------------------------------------------------------------------------------
# Hessian of the log-likelihood
#------------------------------------------------------------------------------
fHLogVerosimilitud = function (B,Y,XX) {
nrow = nrow(Y)
prow = nrow(B)
H = matrix(0, nrow = prow, ncol = prow)
for (i in 1:nrow) {
Yi = as.numeric(Y[i,1])
Xi = as.matrix(XX[i,],prow,1)
K = as.numeric((exp(t(Xi)%*%B))/((1 + exp(t(Xi)%*%B))^2))
H = H + K*Xi%*%t(Xi)
}
return(-H)
}
#------------------------------------------------------------------------------
# Fisher information
#------------------------------------------------------------------------------
fInformacionFisher = function (B,Y,XX) {
return(-fHLogVerosimilitud(B,Y,XX))
}
#------------------------------------------------------------------------------
# Method 3: Fisher scoring iterations
#------------------------------------------------------------------------------
B0 = matrix(0, nrow = ncol(XX), ncol = 1)
B1 = B0
epsilon = 0.000000001
i = 1
df_resultado=
data.frame (Iteracion = i,
Estimacion = paste(paste("(" ,toString(round(B1,6) )),")"),
Control = c(''),
LogVerosimilitud = fLogVerosimilitud (B1,Y,XX),
Gradiente = paste(paste("(" ,toString(round(fGLogVerosimilitud (B1,Y,XX),3) )),")"))
while (( fNorma ( B1,B0) >= epsilon) | (i == 1)){
B0 = B1
i = i + 1
B1 = B0 + solve(fInformacionFisher (B0,Y,XX))%*%fGLogVerosimilitud (B0,Y,XX)
df_iteracion =
data.frame(Iteracion = i,
Estimacion = paste(paste("(" ,toString(round(B1,6) )),")"),
Control = fNorma ( B1,B0),
LogVerosimilitud = fLogVerosimilitud (B1,Y,XX),
Gradiente = paste(paste("(" ,toString(round(fGLogVerosimilitud (B1,Y,XX),2) )),")"))
df_resultado = rbind(df_resultado, df_iteracion)
}
print(df_resultado)
## Iteracion Estimacion
## 1 1 ( 0, 0, 0, 0, 0 )
## 2 2 ( 3.44567, -0.729307, -2.015601, -0.026657, -0.219065 )
## 3 3 ( 4.905973, -1.064244, -2.541895, -0.040734, -0.349482 )
## 4 4 ( 5.272813, -1.14628, -2.669792, -0.04435, -0.390766 )
## 5 5 ( 5.292099, -1.15048, -2.676629, -0.04454, -0.393452 )
## 6 6 ( 5.292151, -1.150491, -2.676648, -0.04454, -0.393461 )
## 7 7 ( 5.292151, -1.150491, -2.676648, -0.04454, -0.393461 )
## Control LogVerosimilitud
## 1 -493.5208
## 2 4.06397340202227 -331.8679
## 3 1.59337987574198 -321.8618
## 4 0.39922004109232 -321.4687
## 5 0.0210617538163169 -321.4677
## 6 5.72824785423957e-05 -321.4677
## 7 4.5552685002082e-10 -321.4677
## Gradiente
## 1 ( -78, -272, -139, -2739.08, -56 )
## 2 ( -13.49, -51.52, -21.86, -490.74, -15.08 )
## 3 ( -2.24, -8.59, -3.55, -81.41, -3.15 )
## 4 ( -0.11, -0.41, -0.17, -3.89, -0.19 )
## 5 ( 0, 0, 0, -0.01, 0 )
## 6 ( 0, 0, 0, 0, 0 )
## 7 ( 0, 0, 0, 0, 0 )
b1 = B1[1]
b2 = B1[2]
b3 = B1[3]
b4 = B1[4]
b5 = B1[5]
df_cuadro_comparacion[,"Scoring.Fisher"] = c(b1,b2,b3,b4,b5)
print(df_cuadro_comparacion)
## Variable R.glm R.optim Scoring.Fisher IRLS Metropolis.Hasting
## 1 (Intercept) 5.29215150 5.30700031 5.29215150 0 0
## 2 Pclass -1.15049134 -1.12496995 -1.15049134 0 0
## 3 Sex -2.67664817 -2.80118718 -2.67664817 0 0
## 4 Age -0.04454008 -0.04383097 -0.04454008 0 0
## 5 SibSp -0.39346077 -0.41496568 -0.39346077 0 0
The IRLS (iteratively reweighted least squares) algorithm is used next; it relies on weighted least squares. It is an iterative method that drives the vector \(\beta\) to convergence by recomputing an adjusted response variable \(Z\) and a weight matrix \(W\) at each step.
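Each iteration below forms the adjusted response and the weights from the current estimate and then solves a weighted least squares problem:
\[Z_{i} = X_{i}^{'}\beta^{(t)} + \frac{y_{i} - \theta_{i}}{\theta_{i}(1-\theta_{i})}, \qquad W = diag\left(\theta_{i}(1-\theta_{i})\right)\] \[\beta^{(t+1)} = (X^{'}WX)^{-1}X^{'}WZ\]
For the logit (canonical) link this update coincides with Fisher scoring, which is why the iteration table below matches the one obtained with Method 3.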
# ==============================================================================
# Method 4: IRLS (iteratively reweighted least squares)
# ==============================================================================
fRespuestaAjustadaZ = function(B,Y,XX){
nrow = nrow(Y)
prow = nrow(B)
Z = matrix(0, nrow = nrow, ncol = 1)
for (i in 1:nrow) {
Yi = as.numeric(Y[i,1])
Xi = as.matrix(XX[i,],prow,1)
Ui = as.numeric((exp(t(Xi)%*%B))/((1 + exp(t(Xi)%*%B))))
dgi = 1/(Ui*(1-Ui))
Z[i,1] = t(Xi)%*%B + dgi*(Yi - Ui)
}
return(Z)
}
fMatrizPonderacionW = function(B,Y,XX){
nrow = nrow(Y)
prow = nrow(B)
W = matrix(0, nrow = nrow, ncol = nrow )
diagonal =c()
for (i in 1:nrow) {
Yi = as.numeric(Y[i,1])
Xi = as.matrix(XX[i,],prow,1)
Ui = as.numeric((exp(t(Xi)%*%B))/((1 + exp(t(Xi)%*%B))))
dgi = 1/(Ui*(1-Ui))
W[i,i] = 1/dgi
}
return(W)
}
# Initial vector B0 = (0, 0, 0, 0, 0)'
B0 = matrix(0, nrow = ncol(XX), ncol = 1)
B1 = B0
epsilon = 0.000000001
i = 1
df_resultado=
data.frame (Iteracion = i,
Estimacion = paste(paste("(" ,toString(round(B1,6) )),")"),
Control = c(''))
while (( fNorma ( B1,B0) >= epsilon) | (i == 1)){
B0 = B1
i = i + 1
# Computing the adjusted response
Z1 = fRespuestaAjustadaZ(B0,Y,XX)
# Computing the weight matrix
W = fMatrizPonderacionW(B0,Y,XX)
# Weighted least squares step
B1 = (solve((t(XX)%*%W)%*%XX))%*%((t(XX)%*%W)%*%Z1)
df_iteracion =
data.frame(Iteracion = i,
Estimacion = paste(paste("(" ,toString(round(B1,6) )),")"),
Control = fNorma ( B1,B0))
df_resultado = rbind(df_resultado, df_iteracion)
}
print(df_resultado)
## Iteracion Estimacion
## 1 1 ( 0, 0, 0, 0, 0 )
## 2 2 ( 3.44567, -0.729307, -2.015601, -0.026657, -0.219065 )
## 3 3 ( 4.905973, -1.064244, -2.541895, -0.040734, -0.349482 )
## 4 4 ( 5.272813, -1.14628, -2.669792, -0.04435, -0.390766 )
## 5 5 ( 5.292099, -1.15048, -2.676629, -0.04454, -0.393452 )
## 6 6 ( 5.292151, -1.150491, -2.676648, -0.04454, -0.393461 )
## 7 7 ( 5.292151, -1.150491, -2.676648, -0.04454, -0.393461 )
## Control
## 1
## 2 4.06397340202227
## 3 1.5933798757421
## 4 0.399220041092303
## 5 0.0210617538153838
## 6 5.72824796852323e-05
## 7 4.55076831072024e-10
b1 = B1[1]
b2 = B1[2]
b3 = B1[3]
b4 = B1[4]
b5 = B1[5]
df_cuadro_comparacion[,"IRLS"] = c(b1,b2,b3,b4,b5)
print(df_cuadro_comparacion)
## Variable R.glm R.optim Scoring.Fisher IRLS
## 1 (Intercept) 5.29215150 5.30700031 5.29215150 5.29215150
## 2 Pclass -1.15049134 -1.12496995 -1.15049134 -1.15049134
## 3 Sex -2.67664817 -2.80118718 -2.67664817 -2.67664817
## 4 Age -0.04454008 -0.04383097 -0.04454008 -0.04454008
## 5 SibSp -0.39346077 -0.41496568 -0.39346077 -0.39346077
## Metropolis.Hasting
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
This method uses a Markov chain to estimate the parameter vector \(\beta\) from samples drawn after the chain has converged to the posterior distribution.
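Concretely, the target is the posterior built from the likelihood and an independent normal prior \(\beta \sim N(0, \sigma_{1}^{2}I)\); proposals are drawn from a symmetric Gaussian random walk \(N(\beta^{(t)}, \sigma_{2}^{2}I)\), so the Hastings ratio reduces to a ratio of posterior densities:
\[\alpha(\beta^{(t)}, \beta^{*}) = min\left(1, \frac{\pi(\beta^{*} \mid y)}{\pi(\beta^{(t)} \mid y)}\right), \qquad \pi(\beta \mid y) \propto L(\beta)\, exp\left(-\frac{\beta^{'}\beta}{2\sigma_{1}^{2}}\right)\]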
# ==============================================================================
# Method 5: Metropolis-Hastings
# ==============================================================================
#------------------------------------------------------------------------------
# Log of the posterior density (up to an additive constant)
#------------------------------------------------------------------------------
flogDistribucionPosteriori = function(Y,XX,B,sigma){
nrow = nrow(Y)
prow = nrow(B)
sumaB = t(B)%*%B
logapriori = -sumaB/(2*(sigma^2))
logVerosimilitud = 0
for (i in 1:nrow){
Yi = as.numeric(Y[i,1])
Xi = as.matrix(XX[i,],prow,1)
exb = exp(-t(Xi)%*%B)
logVerosimilitud = logVerosimilitud + (Yi*log(1/(1 +exb))
+ (1 - Yi)*log(exb/(1 + exb)))
}
return(logVerosimilitud + logapriori)
}
# Chain length M, size K of the retained tail, and subsample size S
M = 300000
K = max(c(0.05*M,1000))
S = max(c(0.20*K,500))
# Hyperparameters: prior standard deviation (sigma1) and proposal variance (sigma2)
sigma1 = 4
sigma2 = 0.01
SIGMA2 = matrix(data = 0, nrow = ncol(XX), ncol = ncol(XX))
diag(SIGMA2) = sigma2
#------------------------------------------------------------------------------
# Metropolis-Hastings
#------------------------------------------------------------------------------
BMetropolis = matrix(data = 0, nrow = M, ncol = ncol(XX))
salto = 0
# 1. Initial value
Bx = matrix(0,1,ncol = ncol(XX))
for( i in 1:M){
# 2. Proposal (symmetric Gaussian random walk)
By = mvtnorm::rmvnorm(n = 1, mean = Bx, sigma = SIGMA2)
# 3. Acceptance probability
logPIy = flogDistribucionPosteriori(Y,XX, t(By),sigma1)
logPIx = flogDistribucionPosteriori(Y,XX, t(Bx),sigma1)
logalphaxy = min(c( logPIy - logPIx,0))
alphaxy = exp(logalphaxy)
if (is.nan(alphaxy)) { alphaxy = 0.0}
# 4. Accept/reject
if (runif(1) < alphaxy){
Bx = By
salto = salto + 1
}
BMetropolis[i, ] = Bx
# 5. Progress
if (i%%floor(M/10) == 0){
cat(100*round(i/M, 1), "% completed ... \n", sep = "" )
}
}
## 10% completed ...
## 20% completed ...
## 30% completed ...
## 40% completed ...
## 50% completed ...
## 60% completed ...
## 70% completed ...
## 80% completed ...
## 90% completed ...
## 100% completed ...
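# A quick sanity check before discarding the burn-in: the empirical acceptance
# rate of the random-walk proposals (a sketch; not part of the original output)
cat("Acceptance rate:", round(salto/M, 3), "\n")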
# Burn-in: discard the first (M - K) iterations
UltimosK = BMetropolis[(M-K + 1) : M,]
# Simple random sample of size S from the last K vectors
muestra = UltimosK[sample(1:K,S),]
#------------------------------------------------------------------------------
# Convergence diagnostics for the parameters
#------------------------------------------------------------------------------
fDiagnosticoConvergencia = function (pMetropolis,pmuestra){
M = nrow(pMetropolis)
m = nrow(pmuestra)
lmedia = as.numeric(mean(pmuestra))
par(mfrow=c(2,2))
plot( pMetropolis,type="l",xlab = sprintf("Iterations: %.0f",M), ylab = expression(beta),
main = "Parameter convergence")
plot( pmuestra,type="l",xlab = sprintf("Sample: %.0f",m), ylab = expression(beta),
main = "Sample trace")
acf(pmuestra,main = "Sample autocorrelation")
hist(pmuestra,
prob = TRUE,
main = "Parameter distribution",
xlab = expression(beta),
ylab ="Density",
breaks = 100)
abline(v=lmedia, col='red', lwd=3)
par(mfrow=c(1,1))
return (lmedia)
}
The convergence analysis for each parameter is shown below:
# Convergence analysis for b1
b1 = fDiagnosticoConvergencia (BMetropolis[,1],muestra[,1])
# Convergence analysis for b2
b2 = fDiagnosticoConvergencia (BMetropolis[,2],muestra[,2])
# Convergence analysis for b3
b3 = fDiagnosticoConvergencia (BMetropolis[,3],muestra[,3])
# Convergence analysis for b4
b4 = fDiagnosticoConvergencia (BMetropolis[,4],muestra[,4])
# Convergence analysis for b5
b5 = fDiagnosticoConvergencia (BMetropolis[,5],muestra[,5])
df_cuadro_comparacion[,"Metropolis.Hasting"] = c(b1,b2,b3,b4,b5)
print(df_cuadro_comparacion)
## Variable R.glm R.optim Scoring.Fisher IRLS
## 1 (Intercept) 5.29215150 5.30700031 5.29215150 5.29215150
## 2 Pclass -1.15049134 -1.12496995 -1.15049134 -1.15049134
## 3 Sex -2.67664817 -2.80118718 -2.67664817 -2.67664817
## 4 Age -0.04454008 -0.04383097 -0.04454008 -0.04454008
## 5 SibSp -0.39346077 -0.41496568 -0.39346077 -0.39346077
## Metropolis.Hasting
## 1 5.13009656
## 2 -1.11276329
## 3 -2.63775858
## 4 -0.04343847
## 5 -0.38165576
The following table shows the results obtained with the five methods. The estimates are very similar across methods.
print(df_cuadro_comparacion)
## Variable R.glm R.optim Scoring.Fisher IRLS
## 1 (Intercept) 5.29215150 5.30700031 5.29215150 5.29215150
## 2 Pclass -1.15049134 -1.12496995 -1.15049134 -1.15049134
## 3 Sex -2.67664817 -2.80118718 -2.67664817 -2.67664817
## 4 Age -0.04454008 -0.04383097 -0.04454008 -0.04454008
## 5 SibSp -0.39346077 -0.41496568 -0.39346077 -0.39346077
## Metropolis.Hasting
## 1 5.13009656
## 2 -1.11276329
## 3 -2.63775858
## 4 -0.04343847
## 5 -0.38165576
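As a closing illustration, the estimated coefficients can be used to score a new observation. The sketch below is hypothetical (a 30-year-old male in 3rd class with one sibling/spouse aboard) and uses the Method 1 (R glm) column of the comparison table:
# Hypothetical passenger: 3rd class, male (Sex = 1), age 30, one sibling/spouse aboard
x_nuevo = c(Intercept = 1, Pclass = 3, Sex = 1, Age = 30, SibSp = 1)
beta_glm = df_cuadro_comparacion[, "R.glm"]
eta = sum(x_nuevo * beta_glm)              # linear predictor X'B
prob_survived = exp(eta) / (1 + exp(eta))  # inverse logit
prob_survived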