The dataset contains 891 rows and 12 columns.
train <- read.csv("C:/Users/christian.figueroa/Desktop/Actividad 3 regresion/train.csv", sep=";")
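A quick structural check confirms those dimensions before any transformation (a minimal sketch):
dim(train)   # expect 891 rows, 12 columns
str(train)   # column types as read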
library(dplyr)
library(ggplot2)
library(tidyr)
library(cowplot)
library(stringr)
library(corrplot)
library(ggcorrplot)
library(tidyverse)
library(MASS)
library(car)
library(e1071)
library(caret)
library(caTools)
library(pROC)
options(repr.plot.width = 6, repr.plot.height = 4)
missing_data <- train %>% summarise_all(~ sum(is.na(.)) / n())   # share of NAs per column
missing_data <- gather(missing_data, key = "variables", value = "percent_missing")
ggplot(missing_data, aes(x = reorder(variables, percent_missing), y = percent_missing)) +
  geom_bar(stat = "identity", fill = "red", color = "white", size = 0.3) +
  xlab("variables") +
  coord_flip() +
  theme_bw()
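For the exact percentages behind the plot, the same frame can be printed sorted (a sketch):
missing_data %>% arrange(desc(percent_missing))   # Cabin and Age carry most of the missingness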
train$Pclass   <- as.factor(train$Pclass)
train$Survived <- as.factor(train$Survived)
train$sex      <- as.factor(train$Sex)   # NB: adds a lowercase copy of Sex; it is collinear with Sex and surfaces as a singular (NA) term in modelo_1 below
train$Embarked <- as.factor(train$Embarked)
train <- subset(train, select = -c(Cabin, Ticket))   # drop the high-missingness and ID-like columns
options(repr.plot.width = 6, repr.plot.height = 4)
train %>%
  group_by(Survived) %>%
  summarise(Count = n()) %>%
  mutate(percent = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(Survived, -percent), y = percent)) +
  geom_col(fill = c("#FC4E07", "#E7B800")) +
  geom_text(aes(label = sprintf("%.2f%%", percent)), hjust = 0.01, vjust = -0.5, size = 3) +
  theme_bw() +
  xlab("Survived") +
  ylab("Percent") +
  ggtitle("Survived Percent")
options(repr.plot.width = 8, repr.plot.height = 5)
plot_grid(
  ggplot(train, aes(x = Pclass, fill = Survived)) + geom_bar(position = 'fill') + theme_bw(),
  ggplot(train, aes(x = Sex, fill = Survived)) + geom_bar(position = 'fill') + theme_bw(),
  ggplot(train, aes(x = Embarked, fill = Survived)) + geom_bar(position = 'fill') + theme_bw() +
    scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
  align = "h")
options(repr.plot.width = 8, repr.plot.height = 5)
plot_grid(
  ggplot(train, aes(x = SibSp, fill = Survived)) + geom_bar(position = 'fill') + theme_bw(),
  ggplot(train, aes(x = Parch, fill = Survived)) + geom_bar(position = 'fill') + theme_bw() +
    scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
  align = "h")
options(repr.plot.width =10, repr.plot.height = 5)
ggplot(train, aes(y = Age, x = "", fill = Survived)) +
  geom_boxplot() +
  theme_bw() +
  xlab(" ")
## Warning: Removed 177 rows containing non-finite values (stat_boxplot).
##### There appear to be no meaningful differences in Age between survivors and non-survivors; a quick formal check follows before the column is dropped.
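That visual impression can be tested before discarding the column; a minimal sketch (Welch two-sample t-test on the non-missing ages):
t.test(Age ~ Survived, data = train)   # NAs in Age are dropped automatically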
train <- subset(train, select = -Age)   # drop Age (177 missing values)
num_columns <- c("SibSp", "Parch", "Fare")
train[num_columns] <- sapply(train[num_columns], as.numeric)   # coerce to numeric before scaling
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
train_int <- train[, c("SibSp", "Parch", "Fare")]
train_int <- data.frame(scale(train_int))   # standardize to mean 0, sd 1
head(train_int)
## SibSp Parch Fare
## 1 0.4325504 -0.4734077 -0.52526810
## 2 0.4325504 -0.4734077 3.88486027
## 3 -0.4742788 -0.4734077 -0.52104913
## 4 0.4325504 -0.4734077 -0.23869036
## 5 -0.4742788 -0.4734077 -0.52026784
## 6 -0.4742788 -0.4734077 -0.04191114
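scale() standardizes each column to mean 0 and unit standard deviation; the first Fare value above can be reproduced by hand (a sketch; na.rm mirrors how scale() skips the coerced NAs):
(train$Fare[1] - mean(train$Fare, na.rm = TRUE)) / sd(train$Fare, na.rm = TRUE)   # ~ -0.5253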
# keep the categorical columns only (drops PassengerId, Name, SibSp, Parch, Fare)
train_cat <- train[, -c(1, 4, 6, 7, 8)]
# one indicator per non-reference factor level
dummy <- data.frame(sapply(train_cat, function(x) data.frame(model.matrix(~ x - 1, data = train_cat))[, -1]))
train_final <- cbind(train_int,dummy)
head(train_final)
## SibSp Parch Fare Survived Pclass.x2 Pclass.x3 Sex
## 1 0.4325504 -0.4734077 -0.52526810 0 0 1 1
## 2 0.4325504 -0.4734077 3.88486027 1 0 0 0
## 3 -0.4742788 -0.4734077 -0.52104913 1 0 1 0
## 4 0.4325504 -0.4734077 -0.23869036 1 0 0 0
## 5 -0.4742788 -0.4734077 -0.52026784 0 0 1 1
## 6 -0.4742788 -0.4734077 -0.04191114 0 0 1 1
## Embarked.xQ Embarked.xS sex
## 1 0 1 1
## 2 0 0 0
## 3 0 1 0
## 4 0 1 0
## 5 0 1 1
## 6 1 0 1
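The model.matrix(~ x - 1)[, -1] idiom builds one indicator per factor level and drops the first as the reference. caret (already loaded) offers an equivalent, arguably tidier route; a sketch, assuming the categorical columns prepared above:
dv <- dummyVars(~ Pclass + Sex + Embarked, data = train, fullRank = TRUE)   # fullRank drops the reference level
head(predict(dv, newdata = train))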
set.seed(123)
indices <- sample.split(train$Survived, SplitRatio = 0.7)   # stratified 70/30 split on the outcome
train1 <- train_final[indices, ]
test <- train_final[!indices, ]
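sample.split stratifies on the outcome, so both partitions should retain roughly the original survival rate; a quick sketch to verify:
prop.table(table(train1$Survived))   # ~0.62 / 0.38, as in the full data
prop.table(table(test$Survived))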
modelo_1 <- glm(Survived ~ ., data = train1, family = "binomial")   # full logistic model on all predictors
summary(modelo_1)
##
## Call:
## glm(formula = Survived ~ ., family = "binomial", data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4895 -0.6958 -0.4256 0.6124 2.4580
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.5173 0.3705 6.795 1.09e-11 ***
## SibSp -0.2354 0.1277 -1.843 0.0653 .
## Parch -0.1313 0.1131 -1.161 0.2455
## Fare 0.2954 0.1361 2.171 0.0299 *
## Pclass.x2 -0.3237 0.3234 -1.001 0.3169
## Pclass.x3 -1.4551 0.2903 -5.012 5.40e-07 ***
## Sex -2.7946 0.2418 -11.558 < 2e-16 ***
## Embarked.xQ -0.2466 0.4868 -0.507 0.6125
## Embarked.xS -0.6437 0.3084 -2.087 0.0369 *
## sex NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 806.42 on 608 degrees of freedom
## Residual deviance: 541.28 on 600 degrees of freedom
## (14 observations deleted due to missingness)
## AIC: 559.28
##
## Number of Fisher Scoring iterations: 5
modelo_2 <- stepAIC(modelo_1, direction = "both")   # stepwise selection by AIC, in both directions
## Start: AIC=559.28
## Survived ~ SibSp + Parch + Fare + Pclass.x2 + Pclass.x3 + Sex +
## Embarked.xQ + Embarked.xS + sex
##
##
## Step: AIC=559.28
## Survived ~ SibSp + Parch + Fare + Pclass.x2 + Pclass.x3 + Sex +
## Embarked.xQ + Embarked.xS
##
## Df Deviance AIC
## - Embarked.xQ 1 541.54 557.54
## - Pclass.x2 1 542.28 558.28
## - Parch 1 542.66 558.66
## <none> 541.28 559.28
## - SibSp 1 545.10 561.10
## - Embarked.xS 1 545.59 561.59
## - Fare 1 546.13 562.13
## - Pclass.x3 1 566.93 582.93
## - Sex 1 711.24 727.24
##
## Step: AIC=557.54
## Survived ~ SibSp + Parch + Fare + Pclass.x2 + Pclass.x3 + Sex +
## Embarked.xS
##
## Df Deviance AIC
## - Pclass.x2 1 542.61 556.61
## - Parch 1 542.78 556.78
## <none> 541.54 557.54
## + Embarked.xQ 1 541.28 559.28
## - SibSp 1 545.41 559.41
## - Embarked.xS 1 546.11 560.11
## - Fare 1 547.02 561.02
## - Pclass.x3 1 569.24 583.24
## - Sex 1 711.93 725.93
##
## Step: AIC=556.61
## Survived ~ SibSp + Parch + Fare + Pclass.x3 + Sex + Embarked.xS
##
## Df Deviance AIC
## - Parch 1 543.97 555.97
## <none> 542.61 556.61
## + Pclass.x2 1 541.54 557.54
## + Embarked.xQ 1 542.28 558.28
## - SibSp 1 546.43 558.43
## - Embarked.xS 1 547.82 559.82
## - Fare 1 550.34 562.34
## - Pclass.x3 1 578.50 590.50
## - Sex 1 712.71 724.71
##
## Step: AIC=555.97
## Survived ~ SibSp + Fare + Pclass.x3 + Sex + Embarked.xS
##
## Df Deviance AIC
## <none> 543.97 555.97
## + Parch 1 542.61 556.61
## + Pclass.x2 1 542.78 556.78
## + Embarked.xQ 1 543.80 557.80
## - Embarked.xS 1 549.72 559.72
## - SibSp 1 550.22 560.22
## - Fare 1 551.02 561.02
## - Pclass.x3 1 580.13 590.13
## - Sex 1 716.75 726.75
summary(modelo_2)
##
## Call:
## glm(formula = Survived ~ SibSp + Fare + Pclass.x3 + Sex + Embarked.xS,
## family = "binomial", data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3859 -0.7068 -0.4159 0.6046 2.3929
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.2552 0.2954 7.634 2.27e-14 ***
## SibSp -0.2816 0.1224 -2.301 0.02138 *
## Fare 0.3250 0.1256 2.587 0.00967 **
## Pclass.x3 -1.3025 0.2232 -5.836 5.35e-09 ***
## Sex -2.7032 0.2292 -11.796 < 2e-16 ***
## Embarked.xS -0.6179 0.2570 -2.404 0.01621 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 806.42 on 608 degrees of freedom
## Residual deviance: 543.97 on 603 degrees of freedom
## (14 observations deleted due to missingness)
## AIC: 555.97
##
## Number of Fisher Scoring iterations: 5
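The coefficients are on the log-odds scale; exponentiating turns them into odds ratios, which are easier to read (a sketch; confint() profiles the likelihood via MASS, already loaded):
exp(cbind(OR = coef(modelo_2), confint(modelo_2)))   # e.g. Sex: exp(-2.70) ~ 0.07, male odds of survival ~93% lower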
vif(modelo_2)   # variance inflation factors: values near 1 indicate negligible multicollinearity
## SibSp Fare Pclass.x3 Sex Embarked.xS
## 1.088383 1.100763 1.076532 1.101108 1.095943
anova(modelo_2, modelo_1)   # deviance comparison: reduced vs. full model
## Analysis of Deviance Table
##
## Model 1: Survived ~ SibSp + Fare + Pclass.x3 + Sex + Embarked.xS
## Model 2: Survived ~ SibSp + Parch + Fare + Pclass.x2 + Pclass.x3 + Sex +
## Embarked.xQ + Embarked.xS + sex
## Resid. Df Resid. Dev Df Deviance
## 1 603 543.97
## 2 600 541.28 3 2.6919
pchisq(2.6919, df = 3, lower.tail = FALSE)   # p-value of the likelihood-ratio test
## [1] 0.4416056
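The same likelihood-ratio test can be requested in one step (a sketch; anova.glm with test = "Chisq" reports the identical p-value):
anova(modelo_2, modelo_1, test = "Chisq")   # p ~ 0.44: the dropped terms add nothing significant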
pred <- predict(modelo_2, type = "response", newdata = test)   # predicted survival probabilities on the test set
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.01310 0.09446 0.25136 0.36207 0.61942 0.97808 9
test$prob <- pred
# using 0.5 as the classification cutoff
pred_Survived <- factor(ifelse(pred >= 0.5, "Yes", "No"))
actual_Survived <- factor(ifelse(test$Survived==1,"Yes","No"))
table(actual_Survived,pred_Survived)
## pred_Survived
## actual_Survived No Yes
## No 138 26
## Yes 33 62
conf_final <- confusionMatrix(pred_Survived, actual_Survived, positive = "Yes")
accuracy <- conf_final$overall[1]
sensitivity <- conf_final$byClass[1]
specificity <- conf_final$byClass[2]
accuracy
## Accuracy
## 0.7722008
sensitivity
## Sensitivity
## 0.6526316
specificity
## Specificity
## 0.8414634
perform_fn <- function(cutoff) {
  predicted_survived <- factor(ifelse(pred >= cutoff, "Yes", "No"))
  conf <- confusionMatrix(predicted_survived, actual_Survived, positive = "Yes")
  accuracy <- conf$overall[1]
  sensitivity <- conf$byClass[1]
  specificity <- conf$byClass[2]
  out <- t(as.matrix(c(sensitivity, specificity, accuracy)))
  colnames(out) <- c("sensitivity", "specificity", "accuracy")
  return(out)
}
options(repr.plot.width =8, repr.plot.height =6)
s <- seq(0.01, 0.80, length = 100)   # grid of candidate cutoffs
OUT <- matrix(0, 100, 3)
for (i in 1:100) OUT[i, ] <- perform_fn(s[i])
plot(s, OUT[, 1], xlab = "Cutoff", ylab = "Value", cex.lab = 1.5, cex.axis = 1.5,
     ylim = c(0, 1), type = "l", lwd = 2, axes = FALSE, col = 2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend("bottom",col=c(2,"darkgreen",4,"darkred"),text.font =3,inset = 0.02,
box.lty=0,cex = 0.8,
lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))
abline(v = 0.5, col="red", lwd=1, lty=2)
axis(1, at = seq(0.1, 1, by = 0.1))
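The 0.26 cutoff adopted below can be derived from this sweep as the point where the sensitivity and specificity curves cross (a sketch using the OUT matrix computed above):
best_idx <- which.min(abs(OUT[, 1] - OUT[, 2]))   # sensitivity ~ specificity
s[best_idx]                                       # ~0.26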
glm.roc <- roc(response = test$Survived, predictor = as.numeric(pred))   # ROC curve on the test set
plot(glm.roc, legacy.axes = TRUE, print.auc.y = 1.0, print.auc = TRUE)
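pROC can also propose a threshold directly, via Youden's J statistic (a sketch; coords() with x = "best" is part of pROC):
coords(glm.roc, x = "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))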
# using 0.26 as the classification cutoff
pred_Survived <- factor(ifelse(pred >= 0.26, "Yes", "No"))
actual_Survived <- factor(ifelse(test$Survived==1,"Yes","No"))
table(actual_Survived,pred_Survived)
## pred_Survived
## actual_Survived No Yes
## No 123 41
## Yes 26 69
conf_final <- confusionMatrix(pred_Survived, actual_Survived, positive = "Yes")
accuracy1 <- conf_final$overall[1]
sensitivity1 <- conf_final$byClass[1]
specificity1 <- conf_final$byClass[2]
accuracy1
## Accuracy
## 0.7413127
sensitivity1
## Sensitivity
## 0.7263158
specificity1
## Specificity
## 0.75
# same sensitivity/specificity/accuracy sweep as above, now marking the 0.26 cutoff
plot(s, OUT[, 1], xlab = "Cutoff", ylab = "Value", cex.lab = 1.5, cex.axis = 1.5,
     ylim = c(0, 1), type = "l", lwd = 2, axes = FALSE, col = 2)
axis(1, seq(0, 1, length = 5), seq(0, 1, length = 5), cex.lab = 1.5)
axis(2, seq(0, 1, length = 5), seq(0, 1, length = 5), cex.lab = 1.5)
lines(s, OUT[, 2], col = "darkgreen", lwd = 2)
lines(s, OUT[, 3], col = 4, lwd = 2)
box()
legend("bottom", legend = c("Sensitivity", "Specificity", "Accuracy"),
       col = c(2, "darkgreen", 4), lwd = c(2, 2, 2),
       text.font = 3, inset = 0.02, box.lty = 0, cex = 0.8)
abline(v = 0.26, col = "red", lwd = 1, lty = 2)
axis(1, at = seq(0.1, 1, by = 0.1))
With a 0.26 cutoff the model is more balanced than with 0.5: sensitivity rises from 0.65 to 0.73 at the cost of some specificity (0.84 down to 0.75), while overall accuracy drops only slightly (0.77 to 0.74).
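A side-by-side view of the stored metrics supports that reading (a sketch reusing the objects computed above):
data.frame(cutoff      = c(0.50, 0.26),
           accuracy    = c(accuracy, accuracy1),
           sensitivity = c(sensitivity, sensitivity1),
           specificity = c(specificity, specificity1))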