Using the Titanic dataset, compare different classification approaches (logistic regression, LDA, KNN, and others), and compare the SVM models across values of cost and gamma.
Use the Week #7 class material as a reference.
library(e1071)
library(dplyr)
library(ISLR)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(readr)
library(caret)
library(tidyverse)
titanic <- read.csv("titanic.csv")
Clean the data: drop the columns that will not be used as predictors.
titanic_clean <- titanic %>%
  select(-PassengerId, -Name, -Ticket, -Cabin)
titanic_clean$Survived <- factor(titanic_clean$Survived)
titanic_clean$Pclass <- factor(titanic_clean$Pclass)
titanic_clean$Sex <- factor(titanic_clean$Sex)
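The summary below does not show missing values, but the raw titanic.csv usually has blanks in Age; if that is the case here, a simple median imputation (an assumption, not part of the original cleaning) keeps the SVM fit, and any LDA/KNN comparison, from silently dropping rows. A minimal guarded sketch:
# Assumption: only needed if Age still contains NAs after reading the file.
if (anyNA(titanic_clean$Age)) {
  titanic_clean$Age[is.na(titanic_clean$Age)] <- median(titanic_clean$Age, na.rm = TRUE)
}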
summary(titanic_clean)
Survived Pclass Sex Age
0:549 1:216 female:314 Min. : 0.42
1:342 2:184 male :577 1st Qu.:22.00
3:491 Median :28.00
Mean :29.36
3rd Qu.:35.00
Max. :80.00
SibSp Parch Fare Embarked
Min. :0.000 Min. :0.0000 Min. : 0.00 : 2
1st Qu.:0.000 1st Qu.:0.0000 1st Qu.: 7.91 C:168
Median :0.000 Median :0.0000 Median : 14.45 Q: 77
Mean :0.523 Mean :0.3816 Mean : 32.20 S:644
3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.: 31.00
Max. :8.000 Max. :6.0000 Max. :512.33
str(titanic_clean)
'data.frame': 891 obs. of 8 variables:
$ Survived: Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
$ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
$ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
$ Age : num 22 38 26 35 35 28 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Fare : num ...
A logistic regression model was fit; the ideal cut-off was found to be 0.61, with a specificity of 0.9487 and a sensitivity of 0.5893.
# 70/30 train/test split (a set.seed() call here would make the split reproducible).
train_glm_index <- sample(1:nrow(titanic_clean), nrow(titanic_clean) * 0.7)
train_glm <- titanic_clean[train_glm_index, ]
test_glm  <- titanic_clean[-train_glm_index, ]
glm_titanic <- glm(Survived ~ ., data = train_glm, family = "binomial")
pred_prob <- predict(glm_titanic, test_glm, type = "response")
# Build the ROC curve data by sweeping the cut-off from 0.01 to 0.95.
roc_data <- function(){
  roc_out <- data.frame(especificidad = 0, sensibilidad = 0, cutoff = 0)
  for(i in seq(0.01, 0.95, by = 0.01)){
    pred <- ifelse(pred_prob > i, 1, 0)
    cm <- table(pred, Original = test_glm$Survived)
    sensibilidad  <- cm[2, 2] / sum(cm[, 2])
    especificidad <- cm[1, 1] / sum(cm[, 1])
    roc_out <- rbind(roc_out, c(especificidad, sensibilidad, i))
  }
  return(roc_out[-1, ])
}
roc_plot_data <- roc_data()
View(roc_plot_data)
plot(roc_plot_data[, 1:2], type = "l")
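The cut-off of 0.61 reported above can be recovered from roc_plot_data by taking the cut-off that maximizes Youden's J (sensibilidad + especificidad - 1). The text does not state which criterion was used, so this is a minimal sketch of one common choice:
# Pick the cut-off with the largest Youden's J statistic.
youden <- roc_plot_data$sensibilidad + roc_plot_data$especificidad - 1
roc_plot_data[which.max(youden), ]   # expected to land near cutoff = 0.61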
The SVMs were fit over a grid of parameters, and the model with the highest accuracy was obtained with gamma = 1 and cost = 2.
# Grid search over gamma and cost, reusing the same 70/30 split as above.
svm_data <- data.frame()
for (i in 1:10){
  for (j in 1:10){
    svm_titanic <- svm(Survived ~ ., data = train_glm, gamma = i, cost = j)
    pred <- predict(svm_titanic, test_glm)
    cm <- table(pred, original = test_glm$Survived)
    accuracy <- (cm[1, 1] + cm[2, 2]) / sum(cm)
    svm_data <- rbind(svm_data, data.frame(gamma = i, cost = j, accuracy = accuracy))
  }
}
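With svm_data accumulating one row per (gamma, cost) pair, the best combination reported above (gamma = 1, cost = 2) can be read off directly:
# Best (gamma, cost) combination by test accuracy.
svm_data[which.max(svm_data$accuracy), ]
# Full grid, ordered from best to worst accuracy.
head(svm_data[order(-svm_data$accuracy), ])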
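The assignment also lists LDA and KNN, which are not fit above. A minimal sketch, assuming the MASS and class packages are installed (neither is loaded earlier; MASS masks dplyr::select(), so load it after the cleaning step) and that the cleaned data has no remaining NAs, evaluated on the same train_glm/test_glm split so the accuracies are comparable with the logistic-regression and SVM results:
library(MASS)    # lda(); masks dplyr::select(), so load after the cleaning above
library(class)   # knn()
# LDA on the same 70/30 split used for the other models.
lda_titanic <- lda(Survived ~ ., data = train_glm)
lda_pred    <- predict(lda_titanic, test_glm)$class
lda_acc     <- mean(lda_pred == test_glm$Survived)
# KNN needs a numeric design matrix; model.matrix() dummy-codes the factors
# (scaling the columns first would usually help KNN).
x_train  <- model.matrix(Survived ~ . - 1, data = train_glm)
x_test   <- model.matrix(Survived ~ . - 1, data = test_glm)
knn_pred <- knn(x_train, x_test, cl = train_glm$Survived, k = 5)   # k = 5 is an arbitrary choice
knn_acc  <- mean(knn_pred == test_glm$Survived)
c(lda = lda_acc, knn = knn_acc)
These test accuracies can then be placed next to the logistic-regression results and the best SVM accuracy from the grid above to complete the comparison the assignment asks for.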