R-code-titanic-prediction

I learnt this from various sources, starting with the R courses on DataCamp, the Kaggle website, and several blog posts showing how this problem can be approached with anything from simple classification rules to random forests.

My first Kaggle competition: Machine Learning from the Titanic disaster. The train.csv and test.csv files come from the Kaggle website.

# Alternative source (kept for reference): the DataCamp-hosted copies of the
# Kaggle files.
#train_url <- "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
#train <- read.csv(train_url)
# Import the testing set: test
#test_url <- "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
#test <- read.csv(test_url)
# Load the training set from the local copy and show its structure.
# stringsAsFactors = TRUE is spelled out because the read.csv() default became
# FALSE in R 4.0.0; the str() output below and the later summary() calls on
# Sex and Embarked rely on the text columns being factors.
train_data <- read.csv("~/Google Drive/Kaggle/Titanic-Disaster/train.csv",
                       stringsAsFactors = TRUE)
str(train_data)

## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...

Passengers who survived vs. passengers who died, overall and broken down by sex (female and male)

# Number of passengers who died (0) and who survived (1)
table(train_data[["Survived"]])

## 
##   0   1 
## 549 342

# The same counts as shares of all passengers
prop.table(table(train_data[["Survived"]]))

## 
##         0         1 
## 0.6161616 0.3838384

# Cross-tabulation of Sex (rows) against Survived (columns)
table(train_data[["Sex"]], train_data[["Survived"]])

##         
##            0   1
##   female  81 233
##   male   468 109

# Shares within each row, so every Sex row sums to 1
prop.table(table(train_data[["Sex"]], train_data[["Survived"]]), margin = 1)

##         
##                  0         1
##   female 0.2579618 0.7420382
##   male   0.8110919 0.1889081

# About 74% of the females survived
# Only about 19% of the males survived

Create the column child, and indicate whether child or no child

# Child is 1 for passengers younger than 18 and 0 otherwise.
# Passengers whose Age is NA are explicitly kept at 0 (they cannot be shown
# to be under 18), rather than relying on NA subscripts being skipped.
train_data$Child <- as.numeric(!is.na(train_data$Age) & train_data$Age < 18)
# Two-way comparison: survival shares within each Child group (rows sum to 1)
prop.table(table(train_data$Child, train_data$Survived), margin = 1)

##    
##             0         1
##   0 0.6388175 0.3611825
##   1 0.4601770 0.5398230

# 54% of the children (Child == 1) survived
# only 36% of the other passengers (Child == 0: adults and unknown ages) survived

So we can make two predictions for the test data; for that, copy test_data to test_one and test_two

# Load the Kaggle test set from the local copy.
# stringsAsFactors = TRUE is spelled out because the read.csv() default became
# FALSE in R 4.0.0, and train_data's text columns are shown as factors above.
test_data <- read.csv("~/Google Drive/Kaggle/Titanic-Disaster/test.csv",
                      stringsAsFactors = TRUE)

PREDICTION 1

# Work on a copy of the test set
test_one <- test_data
# Default prediction: nobody survives
test_one$Survived <- 0

# Flip the prediction to 1 for every female passenger
test_one$Survived[which(test_one$Sex == "female")] <- 1

# Two-column submission (PassengerId, Survived), written without row names
predict1 <- data.frame(
  PassengerId = test_one$PassengerId,
  Survived = test_one$Survived
)
write.csv(predict1, file = "~/Google Drive/Kaggle/Titanic-Disaster/predict1.csv", row.names = FALSE)

PREDICTION 2

# Work on a second copy of the test set
test_two <- test_data
# Default prediction: nobody survives
test_two$Survived <- 0

# Child flag: 1 when Age is below 18, otherwise 0 (rows with NA Age stay 0)
test_two$Child <- 0
test_two$Child[which(test_two$Age < 18)] <- 1

summary(train_data[["Age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.42   20.12   28.00   29.70   38.00   80.00     177

summary(train_data[["Sex"]])

## female   male 
##    314    577

# Shares of all passengers in each Sex x Survived cell
prop.table(table(train_data[["Sex"]], train_data[["Survived"]]))

##         
##                   0          1
##   female 0.09090909 0.26150393
##   male   0.52525253 0.12233446

# Shares within each Sex row
prop.table(table(train_data[["Sex"]], train_data[["Survived"]]), margin = 1)

##         
##                  0         1
##   female 0.2579618 0.7420382
##   male   0.8110919 0.1889081

# Set Survived to 1 for females whose Child flag is 0 (not flagged as under 18)
# NOTE(review): females with Child == 1 keep the default 0 - confirm that
# excluding them is intended.
test_two$Survived[which(test_two$Sex == "female" & test_two$Child == 0)] <- 1

# Two-column submission (PassengerId, Survived), written without row names
predict2 <- data.frame(
  PassengerId = test_two$PassengerId,
  Survived = test_two$Survived
)
write.csv(predict2, file = "~/Google Drive/Kaggle/Titanic-Disaster/predict2.csv", row.names = FALSE)

#Score 0.78469
#level : 1796

PREDICTION 3

# A further prediction that takes more variables into account

# Number of survivors in each Child x Sex group
aggregate(Survived ~ Child + Sex, data = train_data, FUN = sum)

##   Child    Sex Survived
## 1     0 female      195
## 2     1 female       38
## 3     0   male       86
## 4     1   male       23

# Number of passengers in each Child x Sex group
aggregate(Survived ~ Child + Sex, data = train_data, FUN = length)

##   Child    Sex Survived
## 1     0 female      259
## 2     1 female       55
## 3     0   male      519
## 4     1   male       58

# Survival rate (survivors divided by group size) in each Child x Sex group
aggregate(
  Survived ~ Child + Sex,
  data = train_data,
  FUN = function(flags) {
    sum(flags) / length(flags)
  }
)

##   Child    Sex  Survived
## 1     0 female 0.7528958
## 2     1 female 0.6909091
## 3     0   male 0.1657033
## 4     1   male 0.3965517

# Pclass only takes 3 values, but Fare is continuous and has to be reduced to
# a few categories before it can be tabulated. Fare2 holds the bin label:
# "<10", "10-20", "20-30" or "30+".

train_data3 <- train_data
# Every row starts in the top bin; the lower bins are then filled in
train_data3$Fare2 <- "30+"
train_data3$Fare2[which(train_data3$Fare >= 20 & train_data3$Fare < 30)] <- "20-30"
train_data3$Fare2[which(train_data3$Fare >= 10 & train_data3$Fare < 20)] <- "10-20"
train_data3$Fare2[which(train_data3$Fare < 10)] <- "<10"
# Survival rate for every Fare2 x Pclass x Sex combination present in the data
aggregate(
  Survived ~ Fare2 + Pclass + Sex,
  data = train_data3,
  FUN = function(flags) {
    sum(flags) / length(flags)
  }
)

##    Fare2 Pclass    Sex  Survived
## 1  20-30      1 female 0.8333333
## 2    30+      1 female 0.9772727
## 3  10-20      2 female 0.9142857
## 4  20-30      2 female 0.9000000
## 5    30+      2 female 1.0000000
## 6    <10      3 female 0.5937500
## 7  10-20      3 female 0.5813953
## 8  20-30      3 female 0.3333333
## 9    30+      3 female 0.1250000
## 10   <10      1   male 0.0000000
## 11 20-30      1   male 0.4000000
## 12   30+      1   male 0.3837209
## 13   <10      2   male 0.0000000
## 14 10-20      2   male 0.1587302
## 15 20-30      2   male 0.1600000
## 16   30+      2   male 0.2142857
## 17   <10      3   male 0.1115385
## 18 10-20      3   male 0.2368421
## 19 20-30      3   male 0.1250000
## 20   30+      3   male 0.2400000

# Men do worse than women regardless of class and fare paid. Women in class 3
# with expensive tickets (Fare of 20 or more) mostly did not survive either,
# so the new prediction for the test data uses both observations.

test_three <- test_data
# Default prediction: nobody survives
test_three$Survived <- 0
# Females are predicted to survive ...
test_three$Survived[which(test_three$Sex == "female")] <- 1
# ... except third-class females who paid 20 or more
test_three$Survived[which(test_three$Sex == "female" &
                            test_three$Pclass == 3 &
                            test_three$Fare >= 20)] <- 0

# Two-column submission (PassengerId, Survived), written without row names
predict3 <- data.frame(
  PassengerId = test_three$PassengerId,
  Survived = test_three$Survived
)
write.csv(predict3, file = "~/Google Drive/Kaggle/Titanic-Disaster/predict3.csv", row.names = FALSE)

PREDICTION 4

# Install rpart only when it is missing, so that re-running or knitting the
# script does not download the package again every time.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}

## 
## The downloaded binary packages are in
##  /var/folders/71/vnhmr1ts6w354s54vd5n1xw80000gn/T//RtmpoXDLSE/downloaded_packages

library(rpart)
# Classification tree for Survived that uses only Sex and Age
my_tree <- rpart(
  Survived ~ Sex + Age,
  data = train_data,
  method = "class"
)
#plot(my_tree)
#text(my_tree)


#library(rattle) I had a lot of trouble installing this: it works in the console but fails when knitting, so it is commented out.

library(rpart.plot)
library(RColorBrewer)
# Draw the tree with prp() and keep the obj element of its result
new.fit <- prp(my_tree, snip = TRUE)$obj

## Warning: ignoring snip=TRUE for quartz_off_screen device

# Fancy tree plot (needs rattle, so it stays disabled)
#fancyRpartPlot(my_tree)


# Predicted class for every row of the test set
my_prediction <- predict(my_tree, newdata = test_data, type = "class")

# Submission data frame: PassengerId plus the predicted Survived class
my_solution <- data.frame(
  PassengerId = test_data$PassengerId,
  Survived = my_prediction
)

# The submission should have 418 rows
nrow(my_solution)

## [1] 418

# Write the submission to decision_tree1.csv without row names
write.csv(my_solution, file = "~/Google Drive/Kaggle/Titanic-Disaster/decision_tree1.csv", row.names = FALSE)

PREDICTION 5

# A second decision tree, this time with more predictors

library(rpart)
# Classification tree for Survived on class, sex, age, family counts, fare and port
my_tree2 <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train_data,
  method = "class"
)
#plot(my_tree2)
#text(my_tree2)
# Draw the tree with prp() and keep the obj element of its result
new.fit <- prp(my_tree2, snip = TRUE)$obj

## Warning: ignoring snip=TRUE for quartz_off_screen device

#library(rattle)
#library(rpart.plot)
library(RColorBrewer)

# Fancy tree plot (needs rattle, so it stays disabled)
#fancyRpartPlot(my_tree2)


# Predicted class for every row of the test set
my_prediction2 <- predict(my_tree2, newdata = test_data, type = "class")

# Submission data frame: PassengerId plus the predicted Survived class
my_solution2 <- data.frame(
  PassengerId = test_data$PassengerId,
  Survived = my_prediction2
)

# The submission should have 418 rows
nrow(my_solution2)

## [1] 418

# Write the submission to decision_tree2.csv without row names
write.csv(my_solution2, file = "~/Google Drive/Kaggle/Titanic-Disaster/decision_tree2.csv", row.names = FALSE)

PREDICTION 6

# Same predictors as before, but with rpart.control(minsplit = 2, cp = 0)
# in place of rpart's default control settings
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train_data,
  method = "class",
  control = rpart.control(minsplit = 2, cp = 0)
)
# Draw the tree with prp() and keep the obj element of its result
new.fit <- prp(fit, snip = TRUE)$obj

## Warning: labs do not fit even at cex 0.15, there may be some overplotting

## Warning: ignoring snip=TRUE for quartz_off_screen device

#fancyRpartPlot(new.fit)

Some cleaning of data before we get on with this prediction

# Feature engineering: deriving new variables from the existing ones
# Data wrangling

# Look at the first passenger name
train_data[["Name"]][1]

## [1] Braund, Mr. Owen Harris
## 891 Levels: Abbing, Mr. Anthony ... Zimmerman, Mr. Leo

# To count how many people carry each title, the train and test data are
# stacked into one data frame. test_data first needs the same columns as
# train_data: the Child flag and a Survived column.
test_data$Child <- 0
test_data$Child[which(test_data$Age < 18)] <- 1

test_data$Survived <- NA   # test_data has no Survived column of its own
combi <- rbind(train_data, test_data)

# Names may be stored as a factor; string splitting needs character values

combi$Name <- as.character(combi$Name)
# Split a name on "," and "." so that the title can be taken out separately

strsplit(combi$Name[1], split = "[,.]")  # a list holding one character vector

## [[1]]
## [1] "Braund"       " Mr"          " Owen Harris"

# [[1]] takes the character vector out of the list and [2] picks its second
# piece:
#[1] "Braund"       " Mr"          " Owen Harris"
strsplit(combi$Name[1], split = "[,.]")[[1]][2] # second piece of the first name

## [1] " Mr"

# string
#[1] " Mr"

# Apply the same extraction to every record. vapply() is used instead of
# sapply() so the result is guaranteed to be one character value per name.

combi$Title <- vapply(
  combi$Name,
  function(x) strsplit(x, split = "[,.]")[[1]][2],
  character(1),
  USE.NAMES = FALSE
)

# The extracted piece starts with a space; remove only that leading space so
# that spaces inside a title (e.g. "the Countess") are kept

combi$Title <- sub("^ ", "", combi$Title)

# Some titles have a similar meaning but are written differently, so they are
# combined below using %in%

table(combi$Title)

## 
##         Capt          Col          Don         Dona           Dr 
##            1            4            1            1            8 
##     Jonkheer         Lady        Major       Master         Miss 
##            1            1            2           61          260 
##         Mlle          Mme           Mr          Mrs           Ms 
##            2            1          757          197            2 
##          Rev          Sir the Countess 
##            8            1            1

#Capt         Col         Don        Dona          Dr    Jonkheer        Lady 
#1           4           1           1           8           1           1 
#Major      Master        Miss        Mlle         Mme          Mr         Mrs 
#2          61         260           2           1         757         197 
#Ms         Rev         Sir theCountess 
#2           8           1           1 

combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
# The value in the data is "the Countess" (with a space), as the table above
# shows; matching on 'theCountess' would silently leave it unmerged.
combi$Title[combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")] <- "Lady"

# Convert back to a factor, since the tree models below use Title as a
# categorical predictor

combi$Title <- factor(combi$Title)

# Family size: siblings/spouses (SibSp) + parents/children (Parch) + 1 for
# the passenger themselves

combi$FamilySize <- combi$SibSp + combi$Parch + 1

table(combi$FamilySize)

## 
##   1   2   3   4   5   6   7   8  11 
## 790 235 159  43  22  25  16   8  11

# To tell families of the same size apart, also extract each passenger's
# surname: the first piece of the name when split on "," and ".".
# vapply() guarantees one character value per name.

combi$Surname <- vapply(
  combi$Name,
  function(x) strsplit(x, split = "[,.]")[[1]][1],
  character(1),
  USE.NAMES = FALSE
)

# FamilyID is the family size pasted in front of the surname, e.g. "3Abbott";
# paste0() converts the numeric FamilySize to text itself.

combi$FamilyID <- paste0(combi$FamilySize, combi$Surname)

# Families of size 2 or less are all labelled "Small"

combi$FamilyID[combi$FamilySize <= 2] <- "Small"
# This did not do the job exactly as required: some FamilyIDs in the table
# below still occur only once or twice

table(combi$FamilyID)

## 
##            11Sage           3Abbott         3Appleton         3Beckwith 
##                11                 3                 1                 2 
##           3Boulos           3Bourke            3Brown         3Caldwell 
##                 3                 3                 4                 3 
##          3Christy          3Collyer          3Compton          3Cornell 
##                 2                 3                 3                 1 
##           3Coutts           3Crosby           3Danbom           3Davies 
##                 3                 3                 3                 5 
##            3Dodge          3Douglas             3Drew            3Elias 
##                 3                 1                 3                 3 
##       3Frauenthal        3Frolicher 3Frolicher-Stehli        3Goldsmith 
##                 1                 1                 2                 3 
##       3Gustafsson       3Hamalainen           3Hansen             3Hart 
##                 2                 2                 1                 3 
##             3Hays          3Hickman         3Hiltunen         3Hirvonen 
##                 2                 3                 1                 1 
##         3Jefferys          3Johnson             3Kink    3Kink-Heilmann 
##                 2                 3                 2                 2 
##           3Klasen         3Lahtinen           3Mallet            3McCoy 
##                 3                 2                 3                 3 
##          3Minahan         3Moubarek            3Nakid         3Navratil 
##                 1                 3                 3                 3 
##           3Newell           3Newsom         3Nicholls          3Peacock 
##                 1                 1                 1                 3 
##            3Peter            3Quick         3Richards          3Rosblom 
##                 3                 3                 2                 3 
##           3Samaan        3Sandstrom           3Silven          3Spedden 
##                 3                 3                 1                 3 
##            3Strom          3Taussig           3Thayer           3Thomas 
##                 1                 3                 3                 1 
##            3Touma     3van Billiard         3Van Impe    3Vander Planke 
##                 3                 3                 3                 2 
##            3Wells             3Wick          3Widener          4Allison 
##                 3                 3                 3                 4 
##        4Backstrom          4Baclini           4Becker           4Carter 
##                 1                 4                 4                 4 
##         4Davidson             4Dean           4Herman          4Hocking 
##                 1                 4                 4                 2 
##        4Jacobsohn         4Johnston          4Laroche           4Renouf 
##                 1                 4                 4                 1 
##    4Vander Planke             4West             5Ford          5Hocking 
##                 1                 4                 5                 1 
##    5Kink-Heilmann          5Lefebre          5Palsson          5Ryerson 
##                 1                 5                 5                 5 
##          6Fortune           6Panula             6Rice         6Richards 
##                 6                 6                 6                 1 
##            6Skoog        7Andersson          7Asplund          8Goodwin 
##                 6                 9                 7                 8 
##             Small 
##              1025

# Frequency table of FamilyID as a data frame (columns Var1 and Freq)
famIDs <- data.frame(table(combi$FamilyID))

# Keep only the IDs that occur at most twice
famIDs <- subset(famIDs, Freq <= 2)
# Relabel passengers carrying one of those rare IDs as "Small", then turn
# FamilyID into a factor

combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- "Small"
combi$FamilyID <- factor(combi$FamilyID)

# The engineered features are in place, so combi is split back into the
# training rows (1 to 891) and the test rows (892 to 1309)

train_data <- combi[seq_len(891), ]
test_data <- combi[seq(892, 1309), ]

# Classification tree using the original predictors plus Title, FamilySize
# and FamilyID
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title +
    FamilySize + FamilyID,
  data = train_data,
  method = "class"
)
# Draw the tree with prp() and keep the obj element of its result
new.fit <- prp(fit, snip = TRUE)$obj

## Warning: ignoring snip=TRUE for quartz_off_screen device

# Fancy tree plot (needs rattle, so it stays disabled)
#fancyRpartPlot(fit)

# Predicted class for every row of the test set
my_prediction3 <- predict(fit, newdata = test_data, type = "class")

# Submission data frame: PassengerId plus the predicted Survived class
my_solution3 <- data.frame(
  PassengerId = test_data$PassengerId,
  Survived = my_prediction3
)

# The submission should have 418 rows
nrow(my_solution3)

## [1] 418

# Write the submission to decision_tree3.csv without row names
write.csv(my_solution3, file = "~/Google Drive/Kaggle/Titanic-Disaster/decision_tree3.csv", row.names = FALSE)

PREDICTION 7

# Random Forest

# Start by fitting several small single-predictor trees

# Tree on passenger class only
fit1 <- rpart(Survived ~ Pclass, data = train_data, method = "class")
#fancyRpartPlot(fit1)
# Survival rate for each Pclass x Sex group
aggregate(
  Survived ~ Pclass + Sex,
  data = train_data,
  FUN = function(flags) {
    sum(flags) / length(flags)
  }
)

##   Pclass    Sex  Survived
## 1      1 female 0.9680851
## 2      2 female 0.9210526
## 3      3 female 0.5000000
## 4      1   male 0.3688525
## 5      2   male 0.1574074
## 6      3   male 0.1354467

# Tree on sex only
fit2 <- rpart(Survived ~ Sex, data = train_data, method = "class")
#fancyRpartPlot(fit2)
# Survival rate for each sex
aggregate(
  Survived ~ Sex,
  data = train_data,
  FUN = function(flags) {
    sum(flags) / length(flags)
  }
)

##      Sex  Survived
## 1 female 0.7420382
## 2   male 0.1889081

# Tree on port of embarkation only
fit3 <- rpart(Survived ~ Embarked, data = train_data, method = "class")
#fancyRpartPlot(fit3)

#############################
# Step 1: Cleaning the data
#############################
summary(combi[["Age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.17   21.00   28.00   29.88   39.00   80.00     263

#Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#   0.17   21.00   28.00   29.88   39.00   80.00     263 

# Age has 263 NA values.
# To fill them, fit a regression tree (method = "anova") for Age on the other
# variables, using only the rows where Age is known

Agefit <- rpart(
  Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
  data = combi[!is.na(combi$Age), ],
  method = "anova"
)

# Replace each missing Age with the tree's prediction for that row

combi$Age[is.na(combi$Age)] <- predict(Agefit, newdata = combi[is.na(combi$Age), ])
summary(combi[["Age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.17   22.00   28.86   29.70   36.50   80.00

# Age looks fine now; check whether other variables have a similar problem

summary(combi$Embarked)

##       C   Q   S 
##   2 270 123 914

#C   Q   S 
#2 270 123 914 
# 2 people have no port of embarkation recorded (empty string). Since most
# passengers embarked at Southampton (S), both are assigned S.
which(combi$Embarked == '')

## [1]  62 830

# 62 830

# Select the rows by their empty value instead of hard-coding rows 62 and 830,
# so this still works if the row order of combi changes
combi$Embarked[which(combi$Embarked == "")] <- "S"
# Rebuild the factor to drop the now-unused "" level
combi$Embarked <- factor(combi$Embarked)
summary(combi$Embarked)

##   C   Q   S 
## 270 123 916

#  C   Q   S 
#270 123 916 

# Now look at the Fare variable
summary(combi$Fare)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   7.896  14.450  33.300  31.280 512.300       1

#Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#  0.000   7.896  14.450  33.300  31.280 512.300       1 

# One fare is missing. Fill it with the median of the known fares.

which(is.na(combi$Fare))

## [1] 1044

#1044

# Select the row with is.na() instead of hard-coding row 1044, so this still
# works if the row order of combi changes
combi$Fare[is.na(combi$Fare)] <- median(combi$Fare, na.rm = TRUE)

#Step 2 - 
#Lot to do... Still understanding from blogs and will be using in some available data