# titanic.R

### Competição Kaggle, TITANIC

# Load the training and test sets (read.csv reads a header row by default)
train <- read.csv(file = "train.csv")
test <- read.csv(file = "test.csv")


# The test set has no Survived column: prepend one filled with the placeholder
# "None" so that both data frames share the same set of columns
test.survived <- data.frame(Survived = rep("None", times = nrow(test)), test)

# Stack the training rows on top of the test rows
data.combined <- rbind(train, test.survived)

# Inspect the structure of the combined data
str(data.combined)

## 'data.frame':    1309 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : chr  "0" "1" "1" "1" ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...

# Treat Survived and Pclass as categorical variables
data.combined[c("Survived", "Pclass")] <-
  lapply(data.combined[c("Survived", "Pclass")], as.factor)

# Distribution of the Survived factor
table(data.combined[["Survived"]])

## 
##    0    1 None 
##  549  342  418

# Distribution of passengers across the classes
table(data.combined[["Pclass"]])

## 
##   1   2   3 
## 323 277 709

#Visualizações
library(ggplot2)

# Hypothesis: wealthier passengers (higher class) survived at a higher rate
train[["Pclass"]] <- as.factor(train[["Pclass"]])
ggplot(data = train, mapping = aes(x = Pclass, fill = factor(Survived))) +
  geom_bar() +
  labs(
    title = "Hipotese: Ricos sobreviveram em uma taxa maior",
    x = "Pclass",
    y = "Total Count",
    fill = "Survived"
  )

# Hypothesis: women survived at a higher rate
ggplot(data = train, mapping = aes(x = Sex, fill = factor(Survived))) +
  geom_bar() +
  labs(
    title = "Hipotese: Mulhers sobreviveram em uma taxa maior",
    x = "Sex",
    y = "Total Count",
    fill = "Survived"
  )

# Hypothesis: passengers from a particular port of embarkation survived at a higher rate
ggplot(data = train, mapping = aes(x = Embarked, fill = factor(Survived))) +
  geom_bar() +
  labs(
    title = "Hipotese: Passageiros que embarcaram em determinado local sobreviveram em maior taxa",
    x = "Local de Embarque  (C = Cherbourg; Q = Queenstown; S = Southampton)",
    y = "Total Count",
    fill = "Survived"
  )

# Look at the first few names in the training set
as.character(head(train$Name))

## [1] "Braund, Mr. Owen Harris"                            
## [2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
## [3] "Heikkinen, Miss. Laina"                             
## [4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"       
## [5] "Allen, Mr. William Henry"                           
## [6] "Moran, Mr. James"

# How many distinct names are there across the training and test sets?
sum(!duplicated(as.character(data.combined$Name)))

## [1] 1307

# 1307 distinct names in 1309 records, so two names are repeated.
# Store the names that occur more than once
dup.names <- as.character(data.combined$Name)[duplicated(as.character(data.combined$Name))]

# Show the records flagged as duplicates (second occurrence of each name)
data.combined[duplicated(as.character(data.combined$Name)), ]

##     PassengerId Survived Pclass                 Name    Sex  Age SibSp
## 892         892     None      3     Kelly, Mr. James   male 34.5     0
## 898         898     None      3 Connolly, Miss. Kate female 30.0     0
##     Parch Ticket   Fare Cabin Embarked
## 892     0 330911 7.8292              Q
## 898     0 330972 7.6292              Q

#Sobre "Miss." e "Mr."
library(stringr)

# Passengers titled "Miss.": is there any correlation with the other variables?
# fixed() makes the trailing "." a literal period instead of a regex wildcard,
# and as.character() keeps the match independent of whether Name is a factor.
misses <- data.combined[which(str_detect(as.character(data.combined$Name), fixed("Miss."))), ]
View(misses[1:5, ])

# Hypothesis: is the title "Mrs." correlated with age?
mrses <- data.combined[which(str_detect(as.character(data.combined$Name), fixed("Mrs."))), ]
View(mrses[1:5, ])

# Look at the male passengers for a pattern.
# The mask is built from data.combined itself; a mask taken from train$Sex only
# covers the training rows and silently drops every male in the test set.
males <- data.combined[which(data.combined$Sex == "male"), ]
View(males[1:5, ])

# Try to understand the title "Master."
# data.combined has no Title column at this point in the script, so comparing
# against data.combined$Title selects zero rows; match the literal text in Name.
data.combined[grepl("Master.", as.character(data.combined$Name), fixed = TRUE), ]

##  [1] PassengerId Survived    Pclass      Name        Sex        
##  [6] Age         SibSp       Parch       Ticket      Fare       
## [11] Cabin       Embarked   
## <0 rows> (or 0-length row.names)

#Parece que Master é um titulo atribuido à crianças


#Estendendo o relacionamento entre sobreviventes e classe, adicionando uma nova variável, título,
#ao dataset e explorando o relacionamento tridimensional entre as variáveis


# Extract a coarse title category from a passenger name.
#
# name: a passenger name (character or factor); coerced with as.character().
# Returns the first of "Miss.", "Master.", "Mrs.", "Mr." (checked in that order)
# that occurs anywhere in the name, or "Other" when none of them does.
extractTitle <- function(name) {
  name <- as.character(name)

  for (title in c("Miss.", "Master.", "Mrs.", "Mr.")) {
    # fixed = TRUE matches the period literally; as a regular expression
    # "Mr." would also match "Mra", "Mr " and so on.
    if (any(grepl(title, name, fixed = TRUE))) {
      return(title)
    }
  }

  "Other"
}

# Classify every passenger name. vapply() replaces the loop that grew the
# vector with c() on each iteration (quadratic copying) and iterated over
# 1:nrow(), which misbehaves on an empty data frame; character(1) enforces
# that extractTitle() returns exactly one string per name.
Titles <- vapply(
  as.character(data.combined$Name),
  extractTitle,
  character(1),
  USE.NAMES = FALSE
)
data.combined$Title <- as.factor(Titles)

# Distribution of the titles
table(Titles)

## Titles
## Master.   Miss.     Mr.    Mrs.   Other 
##      61     260     758     199      31

# Inspect the data set with the new Title column
View(data.combined[1:10, ])

# Survival by title, one panel per class (rows 1-891 are the labelled training rows)
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Title, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass) +
  labs(
    title = "Sobreviventes por Classe e Títulos",
    x = "Titulos",
    y = "Total Count",
    fill = "Survived"
  )

# Three-way view of sex, class and survival, for comparison with the plot above
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Sex, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass) +
  labs(
    title = "Sobreviventes por Classe e Sexo",
    x = "Sex",
    y = "Total Count",
    fill = "Survived"
  )

# Look at the age distribution
summary(data.combined[["Age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.17   21.00   28.00   29.88   39.00   80.00     263

# Survival by age, one panel per sex/class combination
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Age, fill = Survived)) +
  facet_wrap(~Sex + Pclass) +
  geom_histogram(binwidth = 10) +
  labs(
    title = "Sobreviventes por idade",
    x = "Age",
    y = "Total Count"
  )

## Warning: Removed 177 rows containing non-finite values (stat_bin).

# Check the theory that "Master." is the title used for boys
boys <- data.combined[data.combined$Title %in% "Master.", ]
summary(boys[["Age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.330   2.000   4.000   5.483   9.000  14.500       8

# Study the "Miss." title
misses <- data.combined[data.combined$Title %in% "Miss.", ]
summary(misses[["Age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.17   15.00   22.00   21.77   30.00   63.00      50

# Age histogram for the labelled "Miss." passengers, one panel per class
ggplot(data = misses[misses$Survived != "None", ], mapping = aes(x = Age, fill = Survived)) +
  facet_wrap(~Pclass) +
  geom_histogram(binwidth = 5) +
  labs(
    title = "Idade para Miss. por Classe",
    x = "Age",
    y = "Total Count"
  )

## Warning: Removed 36 rows containing non-finite values (stat_bin).

# Girls show a different survival rate.
# Keep the "Miss." passengers travelling with no siblings/spouse and no parents/children
misses.alone <- misses[with(misses, which(SibSp == 0 & Parch == 0)), ]
summary(misses.alone[["Age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5.00   21.00   26.00   27.23   32.50   58.00      33

# Count the girls aged 14.5 or younger travelling alone (missing ages are not counted)
sum(misses.alone$Age <= 14.5, na.rm = TRUE)

## [1] 4

# Study the SibSp variable (siblings and spouses aboard)
summary(data.combined[["SibSp"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4989  1.0000  8.0000

# Can SibSp be treated as a factor? Count its distinct values
length(unique(data.combined[["SibSp"]]))

## [1] 7

# Only 7 distinct values, so a factor is reasonable
data.combined[["SibSp"]] <- as.factor(data.combined[["SibSp"]])

table(data.combined[["SibSp"]])

## 
##   0   1   2   3   4   5   8 
## 891 319  42  20  22   6   9

# Survival by SibSp, one panel per class/title combination
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = SibSp, fill = Survived)) +
  stat_count(width = 1) +
  facet_wrap(~Pclass + Title) +
  labs(
    title = "Classe e Título",
    x = "Irmãos e Conjuges",
    y = "Total Count",
    fill = "Survived"
  ) +
  scale_y_continuous(limits = c(0, 300))

# Smaller families appear to survive at a higher rate, so derive a family-size feature.
# The counts are taken from train/test because data.combined$SibSp is a factor by now
temp.sibsp <- c(train[["SibSp"]], test[["SibSp"]])
temp.parch <- c(train[["Parch"]], test[["Parch"]])
data.combined[["Family.size"]] <- as.factor(temp.sibsp + temp.parch + 1)


# Does family size carry predictive power?
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Family.size, fill = Survived)) +
  stat_count(width = 1) +
  facet_wrap(~Pclass + Title) +
  labs(
    title = "Classe e Titulo",
    x = "fTamanho da Família",
    y = "Total Count",
    fill = "Survived"
  ) +
  scale_y_continuous(limits = c(0, 300))

# Inspect the Ticket variable
str(data.combined[["Ticket"]])

##  Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...

# With that many levels Ticket is really free text, not a factor: convert it to character
data.combined[["Ticket"]] <- as.character(data.combined[["Ticket"]])
data.combined[["Ticket"]][1:20]

##  [1] "A/5 21171"        "PC 17599"         "STON/O2. 3101282"
##  [4] "113803"           "373450"           "330877"          
##  [7] "17463"            "349909"           "347742"          
## [10] "237736"           "PP 9549"          "113783"          
## [13] "A/5. 2151"        "347082"           "350406"          
## [16] "248706"           "382652"           "244373"          
## [19] "345763"           "2649"

# Look for a pattern, starting with the first character of each ticket
# (an empty ticket string is mapped to a single space)
ticket.first.char <- ifelse(
  test = nzchar(data.combined$Ticket),
  yes = substr(data.combined$Ticket, start = 1, stop = 1),
  no = " "
)
unique(ticket.first.char)

##  [1] "A" "P" "S" "1" "3" "2" "C" "7" "W" "4" "F" "L" "9" "6" "5" "8"

# Store it as a factor for plotting
data.combined[["Ticket.first.char"]] <- as.factor(ticket.first.char)

str(data.combined)

## 'data.frame':    1309 obs. of  15 variables:
##  $ PassengerId      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived         : Factor w/ 3 levels "0","1","None": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass           : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name             : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex              : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age              : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp            : Factor w/ 7 levels "0","1","2","3",..: 2 2 1 2 1 1 1 4 1 2 ...
##  $ Parch            : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket           : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare             : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin            : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked         : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
##  $ Title            : Factor w/ 5 levels "Master.","Miss.",..: 3 4 2 4 3 3 3 1 4 4 ...
##  $ Family.size      : Factor w/ 9 levels "1","2","3","4",..: 2 2 1 2 1 1 1 5 3 2 ...
##  $ Ticket.first.char: Factor w/ 16 levels "1","2","3","4",..: 10 14 15 1 3 3 1 3 3 2 ...

# First, an overview: survival by first ticket character
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Ticket.first.char, fill = Survived)) +
  geom_bar() +
  labs(
    title = "Sobrevivencia pelo primeiro caracter do Ticket",
    x = "Primeiro caracter do Ticket",
    y = "Contagem",
    fill = "Survived"
  )

# Survival by first ticket character, one panel per class
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Ticket.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass) +
  labs(
    title = "Sobrevivencia pelo primeiro caracter do Ticket",
    x = "Primeiro caracter do Ticket",
    y = "Contagem",
    fill = "Survived"
  ) +
  scale_y_continuous(limits = c(0, 150))

# Look for a pattern when combining class and title
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Ticket.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass + Title) +
  labs(
    title = "Classe e Título",
    x = "Primeiro caracter do Ticket",
    y = "Contagem",
    fill = "Survived"
  ) +
  scale_y_continuous(limits = c(0, 200))

# Next step: the fares paid by the passengers
summary(data.combined[["Fare"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   7.896  14.450  33.300  31.280 512.300       1

length(unique(data.combined[["Fare"]]))

## [1] 282

# Too many distinct values for a factor: keep Fare numeric and look at a histogram
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Fare, fill = Survived)) +
  geom_histogram(binwidth = 5) +
  labs(
    title = "Distribuição por tarifas",
    x = "Tarifa",
    y = "Contagem"
  ) +
  scale_y_continuous(limits = c(0, 200))

# Does fare carry predictive power once class and title are accounted for?
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Fare, fill = Survived)) +
  geom_histogram(binwidth = 5) +
  facet_wrap(~Pclass + Title) +
  labs(
    title = "Distribuição por tarifas, por Classe e Titulo",
    x = "Tarifa",
    y = "Contagem",
    fill = "Survived"
  ) +
  scale_y_continuous(limits = c(0, 50))

# Inspect the Cabin variable
str(data.combined[["Cabin"]])

##  Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...

data.combined[["Cabin"]][1:100]

##   [1]             C85                     C123                   
##   [6]             E46                                            
##  [11] G6          C103                                           
##  [16]                                                            
##  [21]             D56                     A6                     
##  [26]                         C23 C25 C27                        
##  [31]             B78                                            
##  [36]                                                            
##  [41]                                                            
##  [46]                                                            
##  [51]                         D33                     B30        
##  [56] C52                                                        
##  [61]             B28         C83                                
##  [66]             F33                                            
##  [71]                                                            
##  [76] F G73                                                      
##  [81]                                                            
##  [86]                                     C23 C25 C27            
##  [91]                         E31                                
##  [96]             A5          D10 D12                            
## 187 Levels:  A10 A14 A16 A19 A20 A23 A24 A26 A31 A32 A34 A36 A5 A6 ... F E57

# Cabin is free text rather than a factor: convert it to character
data.combined[["Cabin"]] <- as.character(data.combined[["Cabin"]])
data.combined[["Cabin"]][1:100]

##   [1] ""            "C85"         ""            "C123"        ""           
##   [6] ""            "E46"         ""            ""            ""           
##  [11] "G6"          "C103"        ""            ""            ""           
##  [16] ""            ""            ""            ""            ""           
##  [21] ""            "D56"         ""            "A6"          ""           
##  [26] ""            ""            "C23 C25 C27" ""            ""           
##  [31] ""            "B78"         ""            ""            ""           
##  [36] ""            ""            ""            ""            ""           
##  [41] ""            ""            ""            ""            ""           
##  [46] ""            ""            ""            ""            ""           
##  [51] ""            ""            "D33"         ""            "B30"        
##  [56] "C52"         ""            ""            ""            ""           
##  [61] ""            "B28"         "C83"         ""            ""           
##  [66] ""            "F33"         ""            ""            ""           
##  [71] ""            ""            ""            ""            ""           
##  [76] "F G73"       ""            ""            ""            ""           
##  [81] ""            ""            ""            ""            ""           
##  [86] ""            ""            ""            "C23 C25 C27" ""           
##  [91] ""            ""            "E31"         ""            ""           
##  [96] ""            "A5"          "D10 D12"     ""            ""

# Replace empty cabin strings with the marker "U"
data.combined$Cabin[data.combined$Cabin %in% ""] <- "U"
data.combined[["Cabin"]][1:100]

##   [1] "U"           "C85"         "U"           "C123"        "U"          
##   [6] "U"           "E46"         "U"           "U"           "U"          
##  [11] "G6"          "C103"        "U"           "U"           "U"          
##  [16] "U"           "U"           "U"           "U"           "U"          
##  [21] "U"           "D56"         "U"           "A6"          "U"          
##  [26] "U"           "U"           "C23 C25 C27" "U"           "U"          
##  [31] "U"           "B78"         "U"           "U"           "U"          
##  [36] "U"           "U"           "U"           "U"           "U"          
##  [41] "U"           "U"           "U"           "U"           "U"          
##  [46] "U"           "U"           "U"           "U"           "U"          
##  [51] "U"           "U"           "D33"         "U"           "B30"        
##  [56] "C52"         "U"           "U"           "U"           "U"          
##  [61] "U"           "B28"         "C83"         "U"           "U"          
##  [66] "U"           "F33"         "U"           "U"           "U"          
##  [71] "U"           "U"           "U"           "U"           "U"          
##  [76] "F G73"       "U"           "U"           "U"           "U"          
##  [81] "U"           "U"           "U"           "U"           "U"          
##  [86] "U"           "U"           "U"           "C23 C25 C27" "U"          
##  [91] "U"           "U"           "E31"         "U"           "U"          
##  [96] "U"           "A5"          "D10 D12"     "U"           "U"

# Take the first letter of each cabin value as a factor
Cabin.first.char <- as.factor(substring(data.combined$Cabin, first = 1, last = 1))
str(Cabin.first.char)

##  Factor w/ 9 levels "A","B","C","D",..: 9 3 9 3 9 9 5 9 9 9 ...

levels(Cabin.first.char)

## [1] "A" "B" "C" "D" "E" "F" "G" "T" "U"

# Add it to the data set for plotting
data.combined[["Cabin.first.char"]] <- Cabin.first.char

# Overview: survival by first cabin letter
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Cabin.first.char, fill = Survived)) +
  geom_bar() +
  labs(
    title = "Sobrevivência por Cabine",
    x = "Cabines - primeira letra",
    y = "Contagem",
    fill = "Survived"
  ) +
  scale_y_continuous(limits = c(0, 750))

# Same overview, one panel per class
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Cabin.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass) +
  labs(
    title = "Sobrevivência por Cabine",
    x = "Cabines - primeira letra",
    y = "Contagem",
    fill = "Survived"
  ) +
  scale_y_continuous(limits = c(0, 750))

# Same overview, one panel per class/title combination
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Cabin.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass + Title) +
  labs(
    title = "Sobrevivência por Cabine",
    x = "Cabines - primeira letra",
    y = "Contagem",
    fill = "Survived"
  ) +
  scale_y_continuous(limits = c(0, 750))

# Flag passengers whose Cabin value contains a space, i.e. lists more than one cabin
data.combined[["Cabin.multiple"]] <- as.factor(
  ifelse(test = str_detect(data.combined$Cabin, " "), yes = "Y", no = "N")
)

ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Cabin.multiple, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass + Title) +
  labs(
    title = "Pclass, Title",
    x = "cabin.multiple",
    y = "Total Count",
    fill = "Survived"
  ) +
  scale_y_continuous(limits = c(0, 350))

# Is there a relationship between port of embarkation and survival?
ggplot(data = data.combined[seq_len(891), ], mapping = aes(x = Embarked, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass + Title) +
  labs(
    title = "Sobrevivência por local de Embarque",
    x = "Local de Embarque",
    y = "Contagem",
    fill = "Survived"
  ) +
  scale_y_continuous(limits = c(0, 300))

#==============================================================================
#
#  Exploratory Modeling
#
#==============================================================================

#install.packages("randomForest", repos="http://cran.rstudio.com/")
library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Random forest (1000 trees, other parameters left at their defaults) on Pclass and Title.
# Rows 1-891 of data.combined are the training rows; the labels come from train
rf.train.1 <- data.combined[seq_len(891), c("Pclass", "Title")]
rf.label <- as.factor(train[["Survived"]])

set.seed(1234)
rf.1 <- randomForest(x = rf.train.1, y = rf.label, ntree = 1000, importance = TRUE)
rf.1

## 
## Call:
##  randomForest(x = rf.train.1, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 20.76%
## Confusion matrix:
##     0   1 class.error
## 0 538  11  0.02003643
## 1 174 168  0.50877193

# Variable importance of the first model
varImpPlot(rf.1)

# Random forest (1000 trees, other parameters default) on Pclass, Title and SibSp
rf.train.2 <- data.combined[seq_len(891), c("Pclass", "Title", "SibSp")]

set.seed(1234)
rf.2 <- randomForest(x = rf.train.2, y = rf.label, ntree = 1000, importance = TRUE)
rf.2

## 
## Call:
##  randomForest(x = rf.train.2, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 19.75%
## Confusion matrix:
##     0   1 class.error
## 0 487  62   0.1129326
## 1 114 228   0.3333333

# Variable importance of the second model
varImpPlot(rf.2)

# Random forest (1000 trees, other parameters default) on Pclass, Title and Parch
rf.train.3 <- data.combined[seq_len(891), c("Pclass", "Title", "Parch")]

set.seed(1234)
rf.3 <- randomForest(x = rf.train.3, y = rf.label, ntree = 1000, importance = TRUE)
rf.3

## 
## Call:
##  randomForest(x = rf.train.3, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 20.09%
## Confusion matrix:
##     0   1 class.error
## 0 503  46  0.08378871
## 1 133 209  0.38888889

# Variable importance of the third model
varImpPlot(rf.3)

# Random forest (1000 trees, other parameters default) on Pclass, Title, SibSp and Parch
rf.train.4 <- data.combined[seq_len(891), c("Pclass", "Title", "SibSp", "Parch")]

set.seed(1234)
rf.4 <- randomForest(x = rf.train.4, y = rf.label, ntree = 1000, importance = TRUE)
rf.4

## 
## Call:
##  randomForest(x = rf.train.4, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 18.18%
## Confusion matrix:
##     0   1 class.error
## 0 487  62   0.1129326
## 1 100 242   0.2923977

# Variable importance of the fourth model
varImpPlot(rf.4)

# Random forest (1000 trees, other parameters default) on Pclass, Title and Family.size
rf.train.5 <- data.combined[seq_len(891), c("Pclass", "Title", "Family.size")]

set.seed(1234)
rf.5 <- randomForest(x = rf.train.5, y = rf.label, ntree = 1000, importance = TRUE)
rf.5

## 
## Call:
##  randomForest(x = rf.train.5, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 18.41%
## Confusion matrix:
##     0   1 class.error
## 0 485  64   0.1165756
## 1 100 242   0.2923977

# Variable importance of the fifth model
varImpPlot(rf.5)

# Random forest (1000 trees, other parameters default) on Pclass, Title, Parch and Family.size
rf.train.6 <- data.combined[seq_len(891), c("Pclass", "Title", "Parch", "Family.size")]

set.seed(1234)
rf.6 <- randomForest(x = rf.train.6, y = rf.label, ntree = 1000, importance = TRUE)
rf.6

## 
## Call:
##  randomForest(x = rf.train.6, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 19.42%
## Confusion matrix:
##     0   1 class.error
## 0 484  65   0.1183971
## 1 108 234   0.3157895

# Variable importance of the sixth model
varImpPlot(rf.6)

#########################################
#
#   ### Cross Validation ###
#
#########################################

# Test-set rows (892-1309) restricted to the features used by rf.5
test.submit.df <- data.combined[seq(from = 892, to = 1309), c("Pclass", "Title", "Family.size")]
View(test.submit.df)

# Predict survival for the test set with rf.5
rf.5.preds <- predict(object = rf.5, newdata = test.submit.df)
table(rf.5.preds)

## rf.5.preds
##   0   1 
## 258 160

# Build the submission table (PassengerId 892-1309 plus the rf.5 predictions)
# and write it as a CSV file for upload to Kaggle
submit.df <- data.frame(PassengerId = 892:1309, Survived = rf.5.preds)

write.csv(x = submit.df, file = "RF_SB_TIT_20160714.csv", row.names = FALSE)


# Nossa pontuação ficou em 0.79, vamos melhora-la


library(caret)

## Loading required package: lattice

library(doSNOW)

## Loading required package: foreach

## Loading required package: iterators

## Loading required package: snow

# Build 10-fold cross-validation index sets, repeated 10 times, from the training labels
set.seed(seed = 2348)
cv.10.folds <- createMultiFolds(y = rf.label, k = 10, times = 10)

# Check stratification: class counts over the whole training set
table(rf.label)

## rf.label
##   0   1 
## 549 342

# Ratio of survivors to non-survivors in the whole training set
342 / 549

## [1] 0.6229508

# Class counts inside one of the generated index sets (number 33)
table(rf.label[cv.10.folds[[33]]])

## 
##   0   1 
## 494 308

# Ratio of survivors to non-survivors inside that index set
308 / 494

## [1] 0.6234818

# titanic.R

# César

# Wed Sep 07 09:02:08 2016