Nombre: Noboa Andres
Aplicación en el Titanic Dataset
#install.packages("titanic")
#install.packages("tidyverse")
Cargando las librerías
library(titanic)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rattle)
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart)
library(rpart.plot)
# Use library() rather than require(): library() stops with an error when the
# package is missing, whereas require() only warns and returns FALSE, which
# lets the script continue and fail later at the first ggplot() call.
library(ggplot2)
# Load the training data set and inspect its structure.
data("titanic_train")
str(titanic_train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Bar chart of passenger counts per class, with side-by-side bars per sex.
# The stray third argument `title(" ")` was removed: ggplot() does not use it,
# and the plot title is already set by ggtitle() below.
# NOTE(review): this plots `titanic_test` while the rest of the analysis uses
# `titanic_train` -- confirm which data set is intended.
ggplot(titanic_test, aes(x = factor(Pclass), fill = factor(Sex))) +
  geom_bar(position = "dodge") +
  ggtitle("Conteo de Genero segun clase de Pasajero")
# Fit a classification tree that predicts Survived from Sex and Age,
# then draw the fitted tree.
tree <- rpart(
  Survived ~ Sex + Age,
  data = titanic_train,
  method = "class"
)
fancyRpartPlot(tree)
Importancia de las variables (feature importance) en el árbol de decisión
# Print the detailed summary of the fitted tree: the complexity-parameter
# table, the variable importance scores and the per-node split details.
summary(tree)
## Call:
## rpart(formula = Survived ~ Sex + Age, data = titanic_train, method = "class")
## n= 891
##
## CP nsplit rel error xerror xstd
## 1 0.44444444 0 1.0000000 1.0000000 0.04244576
## 2 0.02339181 1 0.5555556 0.5555556 0.03574957
## 3 0.01000000 2 0.5321637 0.5584795 0.03581795
##
## Variable importance
## Sex Age
## 92 8
##
## Node number 1: 891 observations, complexity param=0.4444444
## predicted class=0 expected loss=0.3838384 P(node) =1
## class counts: 549 342
## probabilities: 0.616 0.384
## left son=2 (577 obs) right son=3 (314 obs)
## Primary splits:
## Sex splits as RL, improve=124.426300, (0 missing)
## Age < 6.5 to the right, improve= 8.814172, (177 missing)
##
## Node number 2: 577 observations, complexity param=0.02339181
## predicted class=0 expected loss=0.1889081 P(node) =0.647587
## class counts: 468 109
## probabilities: 0.811 0.189
## left son=4 (553 obs) right son=5 (24 obs)
## Primary splits:
## Age < 6.5 to the right, improve=10.78893, (124 missing)
##
## Node number 3: 314 observations
## predicted class=1 expected loss=0.2579618 P(node) =0.352413
## class counts: 81 233
## probabilities: 0.258 0.742
##
## Node number 4: 553 observations
## predicted class=0 expected loss=0.1681736 P(node) =0.620651
## class counts: 460 93
## probabilities: 0.832 0.168
##
## Node number 5: 24 observations
## predicted class=1 expected loss=0.3333333 P(node) =0.02693603
## class counts: 8 16
## probabilities: 0.333 0.667
Se observa que la variable Sex concentra la mayor parte de la importancia
en la predicción (92 frente a 8 de la variable Age).
# Predict the survival class for a hypothetical 4-year-old male passenger
# and print the result.
my_prediction <- predict(
  tree,
  newdata = data.frame(Age = 4, Sex = "male"),
  type = "class"
)
my_prediction
## 1
## 1
## Levels: 0 1