The RMS Titanic was a British passenger liner that sank on April 15, 1912 during her maiden voyage. This report analyzes the Titanic data on 1309 passengers and crew to determine how passengers’ survival depended on the other variables measured in the dataset.

# Load data
titanic <- read.csv("Titanic_data.csv", header = TRUE, sep = ",")
str(titanic)
## 'data.frame':    1309 obs. of  13 variables:
##  $ Life_boat: Factor w/ 21 levels "","1","10","11",..: 1 1 1 1 1 20 1 7 6 20 ...
##  $ survived : int  0 0 0 0 0 1 0 1 1 1 ...
##  $ pclass   : int  1 1 1 1 3 3 3 3 3 3 ...
##  $ name     : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 547 871 618 203 729 123 724 684 400 679 ...
##  $ sex      : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ age      : num  55 64 46 33 21 32 28 32 32 38 ...
##  $ sibsp    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ parch    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ticket   : Factor w/ 930 levels "110152","110413",..: 685 686 687 688 82 102 102 102 102 102 ...
##  $ fare     : num  50 26 26 5 7.88 ...
##  $ cabin    : Factor w/ 190 levels "","[D] O135_",..: 89 1 1 49 1 1 1 1 1 1 ...
##  $ embarked : Factor w/ 4 levels "B","C","Q","S": 4 4 4 4 4 4 4 4 4 4 ...
##  $ train    : int  0 1 0 1 0 1 1 1 1 1 ...

View the first 5 rows of the Titanic dataset:

titanic[1:5,]
##   Life_boat survived pclass                         name  sex age sibsp
## 1                  0      1  Hipkins, Mr. William Edward male  55     0
## 2                  0      1 Nicholson, Mr. Arthur Ernest male  64     0
## 3                  0      1   Jones, Mr. Charles Cresson male  46     0
## 4                  0      1     Carlsson, Mr. Frans Olof male  33     0
## 5                  0      3   Lockyer, Mr. Edward Thomas male  21     0
##   parch ticket    fare       cabin embarked train
## 1     0    680 50.0000         C39        S     0
## 2     0    693 26.0000                    S     1
## 3     0    694 26.0000                    S     0
## 4     0    695  5.0000 B51 B53 B55        S     1
## 5     0   1222  7.8792                    S     0

Part 2: Data munging

Transform the passenger name variable from a factor to a character vector:

titanic$name <- as.character(titanic$name)

Transform survived and passenger class from integers to factor variables:

titanic$survived <- as.factor(titanic$survived)
titanic$pclass <- as.factor(titanic$pclass)

Part 3: Data summary

3a) Create a 3-way table for the variables survived (passengers’ survival; 0=No, 1=Yes), pclass (passenger class; 1=Upper class, 2=Middle class, 3=Lower class) and sex (male and female).

titanicTab <- xtabs(~survived + pclass + sex, titanic)
ftable(titanicTab) 
##                 sex female male
## survived pclass                
## 0        1               5  118
##          2              12  146
##          3             110  417
## 1        1             139   62
##          2              94   24
##          3             106   76
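
To read this table as survival rates rather than raw counts, the counts can be normalised within each pclass-by-sex cell; a minimal sketch using the titanicTab object created above:

# proportion surviving within each pclass x sex combination (sketch)
ftable(round(prop.table(titanicTab, margin = c(2, 3)), 2))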

3b) Display the five-number summaries and histogram plots for age (passengers’ age) and fare (passengers’ fare in pounds):

fivenum(titanic$age, na.rm=T)
## [1]  0.17 21.00 28.00 38.00 74.00
fivenum(titanic$fare, na.rm=T)
## [1]   0.0000   7.8958  14.4542  31.2750 512.3292
library(ggplot2)
library(gridExtra)
plot1 <- qplot(titanic$age, xlab="Passenger Age (in years)")
plot2 <- qplot(titanic$fare, xlab="Passenger Fare (in pounds)")
grid.arrange(plot1, plot2, ncol=2)
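
Fare is heavily right-skewed (median 14.45 pounds against a maximum of 512.33 pounds), so a log-scale histogram can be easier to read. A minimal sketch, assuming ggplot2 is already loaded as above; the +1 offset is only there to accommodate the zero fares:

# optional: fare histogram on a log10 scale (the +1 avoids log10(0) for zero fares)
qplot(log10(titanic$fare + 1), xlab = "log10(Passenger Fare + 1)")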

3c) Display the frequency tables for sibsp (number of siblings or spouses aboard Titanic) and embarked (port of embarkation; C=Cherbourg, Q=Queenstown, S=Southampton).

sibspTab <- table(titanic$sibsp)
sibspTab
## 
##   0   1   2   3   4   5   8 
## 887 324  45  18  20   6   9
embarkedTab <- table(titanic$embarked)
embarkedTab
## 
##   B   C   Q   S 
##  10 267 123 909

Inspect the data for unusual patterns.

3di) 17 passengers travelled on a zero fare.

summary(titanic$fare==0)
##    Mode   FALSE    TRUE    NA's 
## logical    1291      17       1
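
The rows concerned can be listed directly; a sketch (the columns shown are just an illustrative selection):

# list the passengers who travelled on a zero fare
titanic[!is.na(titanic$fare) & titanic$fare == 0, c("name", "pclass", "fare")]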

3dii) Four passengers (Miss Ward, Mr and Mrs Cardeza, and Mr Lesurer) paid the highest fare of 512.33 pounds.

# maximum fare
expFare <- max(titanic$fare, na.rm=T)
expFare
## [1] 512.3292
# row indices of the passengers who paid the maximum fare
(1:1309)[!is.na(titanic$fare) & titanic$fare == expFare]
## [1] 1165 1166 1167 1168
# names of the 4 passengers
a <- 1
expName <- array(0, dim = c(4, 1))
for (i in 1:1309) {
  if (!is.na(titanic$fare[i]) & titanic$fare[i] == expFare) {
    expName[a] <- titanic$name[i]
    a <- a + 1
  }
}
expName
##      [,1]                                                             
## [1,] "Ward, Miss. Anna"                                               
## [2,] "Cardeza, Mr. Thomas Drake Martinez"                             
## [3,] "Lesurer, Mr. Gustave J"                                         
## [4,] "Cardeza, Mrs. James Warburton Martinez (Charlotte Wardle Drake)"
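
The same four names can be obtained without an explicit loop by indexing the name vector directly; a vectorised sketch equivalent to the loop above:

# vectorised equivalent of the loop above
titanic$name[!is.na(titanic$fare) & titanic$fare == expFare]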

3diii) 33 passengers had at least 6 other family members on board (sibsp + parch >= 6); their names are displayed below.

length((1:1309)[titanic$sibsp + titanic$parch >= 6])
## [1] 33
# names of the 33 passengers with at least 6 other family members on Titanic
bigFam <- array(0, dim = c(33, 1))
a <- 1
for (i in 1:1309) {
  if (titanic$sibsp[i] + titanic$parch[i] >= 6) {
    bigFam[a] <- titanic$name[i]
    a <- a + 1
  }
}
bigFam
##       [,1]                                                       
##  [1,] "Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)"
##  [2,] "Asplund, Master. Clarence Gustaf Hugo"                    
##  [3,] "Asplund, Miss. Lillian Gertrud"                           
##  [4,] "Asplund, Master. Edvin Rojj Felix"                        
##  [5,] "Asplund, Master. Filip Oscar"                             
##  [6,] "Asplund, Mr. Carl Oscar Vilhelm Gustafsson"               
##  [7,] "Asplund, Master. Carl Edgar"                              
##  [8,] "Andersson, Mr. Anders Johan"                              
##  [9,] "Andersson, Miss. Ellis Anna Maria"                        
## [10,] "Andersson, Miss. Ingeborg Constanzia"                     
## [11,] "Andersson, Miss. Sigrid Elisabeth"                        
## [12,] "Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)"
## [13,] "Andersson, Miss. Ebba Iris Alfrida"                       
## [14,] "Andersson, Master. Sigvard Harald Elias"                  
## [15,] "Goodwin, Master. William Frederick"                       
## [16,] "Goodwin, Miss. Lillian Amy"                               
## [17,] "Goodwin, Master. Sidney Leonard"                          
## [18,] "Goodwin, Master. Harold Victor"                           
## [19,] "Goodwin, Mrs. Frederick (Augusta Tyler)"                  
## [20,] "Goodwin, Mr. Charles Edward"                              
## [21,] "Goodwin, Mr. Charles Frederick"                           
## [22,] "Goodwin, Miss. Jessie Allis"                              
## [23,] "Sage, Master. Thomas Henry"                               
## [24,] "Sage, Miss. Constance Gladys"                             
## [25,] "Sage, Mr. Frederick"                                      
## [26,] "Sage, Mr. George John Jr"                                 
## [27,] "Sage, Miss. Stella Anne"                                  
## [28,] "Sage, Mr. Douglas Bullen"                                 
## [29,] "Sage, Miss. Dorothy Edith \"Dolly\""                      
## [30,] "Sage, Miss. Ada"                                          
## [31,] "Sage, Mr. John George"                                    
## [32,] "Master Anthony William Sage "                             
## [33,] "Sage, Mrs. John (Annie Bullen)"
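
As above, the loop can be replaced by direct indexing; a vectorised sketch (it assumes, as the loop does, that sibsp and parch contain no missing values):

# vectorised equivalent of the loop above
titanic$name[titanic$sibsp + titanic$parch >= 6]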

3div) There were some large travel groups in which at least 6 passengers shared the same boarding ticket. Among the 930 unique tickets, 12 were shared by at least 6 passengers; the 86 passengers’ names are displayed below.

# number of unique tickets
length(unique(titanic$ticket))
## [1] 930
# number of tickets (bottom row) shared by number of passengers (top row)
table(table(titanic$ticket))
## 
##   1   2   3   4   5   6   7   8  11 
## 715 131  49  16   7   4   5   2   1
# ticket numbers of the 12 tickets shared by at least 6 passengers
names(table(titanic$ticket))[table(titanic$ticket) > 5]
##  [1] "113781"       "1601"         "19950"        "3101295"     
##  [5] "347077"       "347082"       "347088"       "382652"      
##  [9] "CA 2144"      "CA. 2343"     "PC 17608"     "S.O.C. 14879"
# names of the 86 passengers who shared the tickets:
shareTix <- titanic[titanic$ticket %in% names(table(titanic$ticket))[table(titanic$ticket) > 5],]
shareTix[order(shareTix$ticket),]$name
##  [1] "Allison, Miss. Helen Loraine"                             
##  [2] "Allison, Master. Hudson Trevor"                           
##  [3] "Allison, Mrs. Hudson J C (Bessie Waldo Daniels)"          
##  [4] "Cleaver, Miss. Alice"                                     
##  [5] "Daniels, Miss. Sarah"                                     
##  [6] "Allison, Mr. Hudson Joshua Creighton"                     
##  [7] "Bing, Mr. Lee"                                            
##  [8] "Ling, Mr. Lee"                                            
##  [9] "Lang, Mr. Fang"                                           
## [10] "Foo, Mr. Choong"                                          
## [11] "Lam, Mr. Ali"                                             
## [12] "Lam, Mr. Len"                                             
## [13] "Chip, Mr. Chang"                                          
## [14] "Hee, Mr. Ling"                                            
## [15] "Fortune, Mr. Charles Alexander"                           
## [16] "Fortune, Miss. Mabel Helen"                               
## [17] "Fortune, Miss. Alice Elizabeth"                           
## [18] "Fortune, Mr. Mark"                                        
## [19] "Fortune, Miss. Ethel Flora"                               
## [20] "Fortune, Mrs. Mark (Mary McDougald)"                      
## [21] "Panula, Master. Juha Niilo"                               
## [22] "Panula, Master. Eino Viljami"                             
## [23] "Panula, Mr. Ernesti Arvid"                                
## [24] "Panula, Mrs. Juha (Maria Emilia Ojala)"                   
## [25] "Panula, Mr. Jaako Arnold"                                 
## [26] "Panula, Master. Urho Abraham"                             
## [27] "Riihivouri, Miss. Susanna Juhantytar Sanni"               
## [28] "Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)"
## [29] "Asplund, Master. Clarence Gustaf Hugo"                    
## [30] "Asplund, Miss. Lillian Gertrud"                           
## [31] "Asplund, Master. Edvin Rojj Felix"                        
## [32] "Asplund, Master. Filip Oscar"                             
## [33] "Asplund, Mr. Carl Oscar Vilhelm Gustafsson"               
## [34] "Asplund, Master. Carl Edgar"                              
## [35] "Andersson, Mr. Anders Johan"                              
## [36] "Andersson, Miss. Ellis Anna Maria"                        
## [37] "Andersson, Miss. Ingeborg Constanzia"                     
## [38] "Andersson, Miss. Sigrid Elisabeth"                        
## [39] "Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)"
## [40] "Andersson, Miss. Ebba Iris Alfrida"                       
## [41] "Andersson, Master. Sigvard Harald Elias"                  
## [42] "Skoog, Master. Harald"                                    
## [43] "Skoog, Mrs. William (Anna Bernhardina Karlsson)"          
## [44] "Skoog, Mr. Wilhelm"                                       
## [45] "Skoog, Miss. Mabel"                                       
## [46] "Skoog, Miss. Margit Elizabeth"                            
## [47] "Skoog, Master. Karl Thorsten"                             
## [48] "Rice, Master. Eugene"                                     
## [49] "Rice, Master. Arthur"                                     
## [50] "Rice, Master. Eric"                                       
## [51] "Rice, Master. George Hugh"                                
## [52] "Rice, Mrs. William (Margaret Norton)"                     
## [53] "Rice, Master. Albert"                                     
## [54] "Goodwin, Master. William Frederick"                       
## [55] "Goodwin, Miss. Lillian Amy"                               
## [56] "Goodwin, Master. Sidney Leonard"                          
## [57] "Goodwin, Master. Harold Victor"                           
## [58] "Goodwin, Mrs. Frederick (Augusta Tyler)"                  
## [59] "Goodwin, Mr. Charles Edward"                              
## [60] "Goodwin, Mr. Charles Frederick"                           
## [61] "Goodwin, Miss. Jessie Allis"                              
## [62] "Sage, Master. Thomas Henry"                               
## [63] "Sage, Miss. Constance Gladys"                             
## [64] "Sage, Mr. Frederick"                                      
## [65] "Sage, Mr. George John Jr"                                 
## [66] "Sage, Miss. Stella Anne"                                  
## [67] "Sage, Mr. Douglas Bullen"                                 
## [68] "Sage, Miss. Dorothy Edith \"Dolly\""                      
## [69] "Sage, Miss. Ada"                                          
## [70] "Sage, Mr. John George"                                    
## [71] "Master Anthony William Sage "                             
## [72] "Sage, Mrs. John (Annie Bullen)"                           
## [73] "Ryerson, Miss. Emily Borie"                               
## [74] "Ryerson, Miss. Susan Parker \"Suzette\""                  
## [75] "Ryerson, Mrs. Arthur Larned (Emily Maria Borie)"          
## [76] "Chaudanson, Miss. Victorine"                              
## [77] "Ryerson, Master. John Borie"                              
## [78] "Ryerson, Mr. Arthur Larned"                               
## [79] "Bowen, Miss. Grace Scott"                                 
## [80] "Hood, Mr. Ambrose Jr"                                     
## [81] "Hickman, Mr. Stanley George"                              
## [82] "Davies, Mr. Charles Henry"                                
## [83] "Hickman, Mr. Leonard Mark"                                
## [84] "Hickman, Mr. Lewis"                                       
## [85] "Deacon, Mr. Percy William"                                
## [86] "Dibden, Mr. William"
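
An alternative way to pull out these passengers is to attach each passenger's ticket-group size to the data and filter on it; a sketch in base R:

# size of each passenger's ticket group, computed row by row
ticketSize <- ave(rep(1, nrow(titanic)), titanic$ticket, FUN = sum)
# the same 86 passengers as above, ordered by ticket
bigGroups <- titanic[ticketSize >= 6, ]
bigGroups[order(bigGroups$ticket), "name"]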

3dv) Passengers 784 and 785 had missing values for the variable age. Both were male passengers in the lower passenger class (pclass=3). The median age of all male passengers in the lower class was 24 years, which we assign to the two passengers’ age values.

# identify the passengers whose age were NA
titanic[is.na(titanic$age),]
##     Life_boat survived pclass                   name  sex age sibsp parch
## 784                  0      3    Kraeff, Mr. Theodor male  NA     0     0
## 785                  0      3 Gheorgheff, Mr. Stanio male  NA     0     0
##     ticket   fare cabin embarked train
## 784 349253 7.8958              C     1
## 785 349254 7.8958              C     1
# there were 493 male passengers in the lower class
length((1:1309)[(titanic$pclass==3) & (titanic$sex=='male')]) 
## [1] 493
# calculate the median age for all male passengers in the lower class
maleThird <- (1:1309)[(titanic$pclass==3) & (titanic$sex=='male')] 
maleThirdPass <- titanic[maleThird[order(titanic$name[maleThird])], ]
median(maleThirdPass$age,na.rm=T)
## [1] 24
# assign the 2 male passengers to be 24 years old
titanic[784,]$age <- 24
titanic[785,]$age <- 24 
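
The same group median can be cross-checked with a single subset, without the intermediate ordering step (a sketch; because the imputed values equal the group median, it returns 24 whether it is run before or after the assignment above):

# median age of male passengers in the lower class, computed directly
median(titanic$age[titanic$pclass == 3 & titanic$sex == "male"], na.rm = TRUE)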

3dvi) Passenger 118, who travelled in the lower passenger class, had a missing value for the variable fare. The median fare of that class was 8.05 pounds, which we assign to the passenger’s fare value.

Identify the one passenger whose fare is NA:

# identify the passengers whose fare was NA
titanic[is.na(titanic$fare),]
##     Life_boat survived pclass               name  sex age sibsp parch
## 118                  0      3 Storey, Mr. Thomas male  51     0     0
##     ticket fare cabin embarked train
## 118   3701   NA              S     0
# there were 709 passengers in the lower class
length((1:1309)[titanic$pclass==3])
## [1] 709
# the median fare of the lower class tickets was 8.05 pounds
thirdClassPass <- (1:1309)[titanic$pclass==3]
thirdClassTix <- titanic[thirdClassPass[order(titanic$name[thirdClassPass])], ]
median(thirdClassTix$fare,na.rm=T)
## [1] 8.05
# assign the median fare to the NA fare value
titanic[118,]$fare <- 8.05
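
When more values are missing, the same group-median imputation can be written once with ave() instead of fixing rows by hand. A minimal sketch, assuming age should be imputed within sex-by-pclass groups as in 3dv; impute_median is a hypothetical helper, not part of the original analysis:

# hypothetical helper: replace NAs in a vector with the vector's median
impute_median <- function(x) replace(x, is.na(x), median(x, na.rm = TRUE))
# impute missing ages with the median age of the same sex and passenger class
titanic$age <- ave(titanic$age, titanic$sex, titanic$pclass, FUN = impute_median)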

Part 4: Logistic Regression

Divide the data into training and test sets using the train indicator variable, then explore various logistic regression models.

trainSet <- titanic[titanic$train == 1, ]
testSet <- titanic[titanic$train == 0, ]
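
The train column flags the split, so the sizes of the two sets can be checked before modelling; a quick sketch (the model output below implies 891 training and 418 test rows):

# number of passengers in the training and test sets
c(train = nrow(trainSet), test = nrow(testSet))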

4a) Fit the logistic model survived ~ sex + pclass on the training data. The model’s AIC and BIC values are 834.89 and 854.06 respectively. The reference group in model 4(a) (i.e., the intercept) is a female passenger in the upper passenger class, whose log-odds of survival was 2.297; her survival odds were therefore 9.944 (\(e^{2.297}\)) and her probability of survival was 0.909 (\(\frac{9.944}{1+9.944}\)). The sexmale coefficient is the difference in the log-odds of survival between male and female passengers, so an upper-class male passenger’s log-odds of survival was -0.345 (2.297-2.642), his survival odds were 0.708 (\(e^{-0.345}\)), and his probability of survival was 0.415 (\(\frac{0.708}{1+0.708}\)).

The pclass2 coefficient is the difference in the log-odds of survival between the middle and upper passenger classes, so a middle-class female passenger’s log-odds of survival was 1.459 (2.297-0.838), her survival odds were 4.302 (\(e^{1.459}\)), and her probability of survival was 0.811 (\(\frac{4.302}{1+4.302}\)). Likewise, the pclass3 coefficient is the difference in the log-odds of survival between the lower and upper passenger classes, so a lower-class female passenger’s log-odds of survival was 0.392 (2.297-1.905), her survival odds were 1.48 (\(e^{0.392}\)), and her probability of survival was 0.597 (\(\frac{1.48}{1+1.48}\)).

In this model, female passengers in the upper passenger class had the highest probability of survival. The prediction accuracy of this model on the test set is 76.32% (\(\frac{213+106}{418}\)).

model4a <- glm(survived ~ sex + pclass, data = trainSet, family = binomial(link="logit"))
model4a
## 
## Call:  glm(formula = survived ~ sex + pclass, family = binomial(link = "logit"), 
##     data = trainSet)
## 
## Coefficients:
## (Intercept)      sexmale      pclass2      pclass3  
##       2.297       -2.642       -0.838       -1.905  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  887 Residual
## Null Deviance:       1187 
## Residual Deviance: 826.9     AIC: 834.9
drop1(model4a,test="Chi")
## Single term deletions
## 
## Model:
## survived ~ sex + pclass
##        Df Deviance     AIC     LRT  Pr(>Chi)    
## <none>      826.89  834.89                      
## sex     1  1083.11 1089.11 256.220 < 2.2e-16 ***
## pclass  2   917.80  921.80  90.916 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# AIC value
extractAIC(model4a)
## [1]   4.0000 834.8884
# BIC value
extractAIC(model4a, k=log(nrow(trainSet)))
## [1]   4.0000 854.0577
# summarize the prediction on the test data by a 2x2 table
model4a.pred <- predict(model4a, newdata = testSet, type="response")
model4a.pred <- as.numeric(model4a.pred > 0.5)
table(model4a.pred, testSet[,2])
##             
## model4a.pred   0   1
##            0 213  53
##            1  46 106
# report the prediction accuracy of model 4a
correctPred1 <- sum(model4a.pred == testSet[,2])
accuracyPred1 <- correctPred1/dim(testSet)[1]
accuracyPred1
## [1] 0.7631579
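
The odds and probabilities quoted above can be recovered directly from the fitted coefficients; a minimal sketch using model4a:

# convert fitted log-odds into survival probabilities (reference group: female, upper class)
b <- coef(model4a)
plogis(b["(Intercept)"])                 # female, upper class:  ~0.909
plogis(b["(Intercept)"] + b["sexmale"])  # male, upper class:    ~0.415
plogis(b["(Intercept)"] + b["pclass2"])  # female, middle class: ~0.811
plogis(b["(Intercept)"] + b["pclass3"])  # female, lower class:  ~0.597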

4b) Fit a logistic model survived ~ sex*pclass on the training data. The model’s AIC and BIC values are 810.10 and 838.85 respectively; model 4(a) is nested in model 4(b) and has the higher AIC of the two. The reference group in model 4(b) is again a female passenger in the upper passenger class, whose log-odds of survival was 3.412, i.e. her odds of survival were 30.326 (\(e^{3.412}\)) and her probability of survival was 0.968 (\(\frac{30.326}{1+30.326}\)).

Because the model includes interactions between sex and pclass, the main effect of each variable applies only when the other variable is at its reference level, and the interaction terms must be added in to obtain the effects for male passengers. The difference in the log-odds of survival between male upper-class and male middle-class passengers is therefore -1.141 (-0.956-0.185), and between male upper-class and male lower-class passengers it is -1.316 (-3.412+2.096). Similarly, the difference in the log-odds of survival between male and female passengers is -3.949 in the upper class, -4.134 (-3.949-0.185) in the middle class, and -1.853 (-3.949+2.096) in the lower class. Accordingly, the gap in the log-odds of survival between male and female passengers is considerably smaller in magnitude in the lower passenger class.

For males in the middle passenger class, the survival odds were 0.187 (\(e^{3.412-3.949-1.141}\)) and the probability of survival was 0.158 (\(\frac{0.187}{1+0.187}\)). For males in the lower passenger class, the survival odds were 0.157 (\(e^{3.412-3.949-1.316}\)) and the probability of survival was 0.136 (\(\frac{0.157}{1+0.157}\)). In this model, female passengers in the upper passenger class again had the highest probability of survival. The prediction accuracy of this model is also 76.32% (\(\frac{213+106}{418}\)); model 4(b) is nevertheless preferred to model 4(a) because of its smaller AIC value.

model4b <- glm(survived ~ sex * pclass, data = trainSet, family = binomial(link="logit"))
model4b
## 
## Call:  glm(formula = survived ~ sex * pclass, family = binomial(link = "logit"), 
##     data = trainSet)
## 
## Coefficients:
##     (Intercept)          sexmale          pclass2          pclass3  
##          3.4122          -3.9494          -0.9555          -3.4122  
## sexmale:pclass2  sexmale:pclass3  
##         -0.1850           2.0958  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  885 Residual
## Null Deviance:       1187 
## Residual Deviance: 798.1     AIC: 810.1
drop1(model4b,test="Chi")
## Single term deletions
## 
## Model:
## survived ~ sex * pclass
##            Df Deviance    AIC    LRT  Pr(>Chi)    
## <none>          798.10 810.10                     
## sex:pclass  2   826.89 834.89 28.791 5.598e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# AIC value
extractAIC(model4b)
## [1]   6.0000 810.0969
# BIC value
extractAIC(model4b, k=log(nrow(trainSet)))
## [1]   6.000 838.851
# summarize the prediction on the test data by a 2x2 table
model4b.pred <- predict(model4b, newdata = testSet, type="response")
model4b.pred <- as.numeric(model4b.pred > 0.5)
table(model4b.pred, testSet[,2]) 
##             
## model4b.pred   0   1
##            0 213  53
##            1  46 106
# report the prediction accuracy of model 4b
correctPred2 <- sum(model4b.pred == testSet[,2])
accuracyPred2 <- correctPred2/dim(testSet)[1]
accuracyPred2
## [1] 0.7631579
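
Because model 4(a) is nested in model 4(b), the two fits can also be compared with a likelihood-ratio test; a sketch that mirrors the sex:pclass row of the drop1() output above:

# likelihood-ratio test of the nested model 4a against model 4b
anova(model4a, model4b, test = "Chisq")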

4c) Add age to model 4(b) and consider all interactions. Model 4(c)(1), survived ~ sex*pclass*age, has an AIC value and a BIC value of 780.48 and 837.99 respectively. The prediction accuracy of this model is 77.03% (\(\frac{226+96}{418}\)).

model4c1 <- glm(survived ~ sex*pclass*age, data = trainSet, family = "binomial")
model4c1
## 
## Call:  glm(formula = survived ~ sex * pclass * age, family = "binomial", 
##     data = trainSet)
## 
## Coefficients:
##         (Intercept)              sexmale              pclass2  
##             1.43183             -0.08017              2.48904  
##             pclass3                  age      sexmale:pclass2  
##            -1.11655              0.06503             -2.97639  
##     sexmale:pclass3          sexmale:age          pclass2:age  
##            -1.19601             -0.11214             -0.11012  
##         pclass3:age  sexmale:pclass2:age  sexmale:pclass3:age  
##            -0.07918              0.05476              0.09003  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  879 Residual
## Null Deviance:       1187 
## Residual Deviance: 756.5     AIC: 780.5
# AIC value
extractAIC(model4c1)
## [1]  12.0000 780.4776
# BIC value
extractAIC(model4c1, k=log(nrow(trainSet)))
## [1]  12.0000 837.9857
# summarize the prediction on the test data by a 2x2 table
model4c1.pred <- predict(model4c1, newdata = testSet, type="response")
model4c1.pred <- as.numeric(model4c1.pred > 0.5)
table(model4c1.pred, testSet[,2])
##              
## model4c1.pred   0   1
##             0 226  63
##             1  33  96
# report the prediction accuracy of model 4c1
correctPred3 <- sum(model4c1.pred == testSet[,2])
accuracyPred3 <- correctPred3/dim(testSet)[1]
accuracyPred3
## [1] 0.7703349

Model 4(c)(2), survived ~ sex + pclass + age + sex:pclass + sex:age + pclass:age, is selected by stepwise selection. It has an AIC value and a BIC value of 779.39 and 827.32 respectively. The prediction accuracy of this model is also 77.03% (\(\frac{227+95}{418}\)).

model4c2 <- glm(survived ~ sex + pclass + age + sex:pclass + sex:age + pclass:age, data = trainSet, family="binomial")
model4c2
## 
## Call:  glm(formula = survived ~ sex + pclass + age + sex:pclass + sex:age + 
##     pclass:age, family = "binomial", data = trainSet)
## 
## Coefficients:
##     (Intercept)          sexmale          pclass2          pclass3  
##        3.456216        -2.375584         0.887344        -3.306794  
##             age  sexmale:pclass2  sexmale:pclass3      sexmale:age  
##       -0.001241        -1.248044         1.479656        -0.038977  
##     pclass2:age      pclass3:age  
##       -0.055531        -0.005464  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  881 Residual
## Null Deviance:       1187 
## Residual Deviance: 759.4     AIC: 779.4
# AIC value
extractAIC(model4c2)
## [1]  10.0000 779.3916
# BIC value
extractAIC(model4c2, k=log(nrow(trainSet)))
## [1]  10.000 827.315
# summarize the prediction on the test data by a 2x2 table
model4c2.pred <- predict(model4c2, newdata = testSet, type="response")
model4c2.pred <- as.numeric(model4c2.pred > 0.5)
table(model4c2.pred, testSet[,2])
##              
## model4c2.pred   0   1
##             0 227  64
##             1  32  95
# report the prediction accuracy of model 4c2
correctPred4 <- sum(model4c2.pred == testSet[,2])
accuracyPred4 <- correctPred4/dim(testSet)[1]
accuracyPred4
## [1] 0.7703349

Model 4(c)(3), survived ~ sex*pclass*log(age), has an AIC value and a BIC value of 762.71 and 820.22 respectively. The prediction accuracy of this model is 77.75% (\(\frac{238+87}{418}\)). Model 4(c)(3) has the highest accuracy among the three 4(c) models as well as the lowest AIC and BIC values.

model4c3 <- glm(survived ~ sex*pclass*log(age),data = trainSet, family = "binomial")
model4c3
## 
## Call:  glm(formula = survived ~ sex * pclass * log(age), family = "binomial", 
##     data = trainSet)
## 
## Coefficients:
##              (Intercept)                   sexmale  
##                   -2.319                     7.149  
##                  pclass2                   pclass3  
##                    9.546                     2.803  
##                 log(age)           sexmale:pclass2  
##                    1.780                   -10.568  
##          sexmale:pclass3          sexmale:log(age)  
##                   -7.656                    -3.258  
##         pclass2:log(age)          pclass3:log(age)  
##                   -3.183                    -1.950  
## sexmale:pclass2:log(age)  sexmale:pclass3:log(age)  
##                    2.832                     2.825  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  879 Residual
## Null Deviance:       1187 
## Residual Deviance: 738.7     AIC: 762.7
# AIC value
extractAIC(model4c3)
## [1]  12.000 762.714
# BIC value
extractAIC(model4c3, k=log(nrow(trainSet)))
## [1]  12.0000 820.2221
# summarize the prediction on the test data by a 2x2 table
model4c3.pred <- predict(model4c3, newdata = testSet, type="response")
model4c3.pred <- as.numeric(model4c3.pred > 0.5)
table(model4c3.pred, testSet[,2])
##              
## model4c3.pred   0   1
##             0 238  72
##             1  21  87
# report the prediction accuracy of model 4c3
correctPred5 <- sum(model4c3.pred == testSet[,2])
accuracyPred5 <- correctPred5/dim(testSet)[1]
accuracyPred5
## [1] 0.777512
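
The three 4(c) candidates can be put side by side with AIC() and BIC(); a sketch using the models fitted above:

# compare the three models from part 4(c)
AIC(model4c1, model4c2, model4c3)
BIC(model4c1, model4c2, model4c3)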

4d) Add fare, embarked, sibsp, and parch to model 4(c)(3). This model, 4(d)(1), has an AIC value and a BIC value of 736.70 and 822.96 respectively.

model4d1 <- glm(survived ~ sex*pclass*log(age) + fare + embarked + sibsp + parch, data = trainSet, family="binomial")
model4d1
## 
## Call:  glm(formula = survived ~ sex * pclass * log(age) + fare + embarked + 
##     sibsp + parch, family = "binomial", data = trainSet)
## 
## Coefficients:
##              (Intercept)                   sexmale  
##               -16.015839                  6.879101  
##                  pclass2                   pclass3  
##                 9.654754                  3.487871  
##                 log(age)                      fare  
##                 1.604966                  0.002369  
##                embarkedC                 embarkedQ  
##                14.709855                 14.769666  
##                embarkedS                     sibsp  
##                14.351511                 -0.546037  
##                    parch           sexmale:pclass2  
##                -0.163529                -10.149199  
##          sexmale:pclass3          sexmale:log(age)  
##                -6.787747                 -3.198238  
##         pclass2:log(age)          pclass3:log(age)  
##                -3.133153                 -2.123169  
## sexmale:pclass2:log(age)  sexmale:pclass3:log(age)  
##                 2.674810                  2.521739  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  873 Residual
## Null Deviance:       1187 
## Residual Deviance: 700.7     AIC: 736.7
# AIC value
extractAIC(model4d1)
## [1]  18.0000 736.6964
# BIC value
extractAIC(model4d1, k=log(nrow(trainSet)))
## [1]  18.0000 822.9586

After running step() with the AIC criterion (k = 2), we obtain model 4(d)(2): survived ~ sex + pclass + log(age) + embarked + sibsp + sex:pclass + sex:log(age) + pclass:log(age) + sex:pclass:log(age). The model has an AIC value and a BIC value of 734.73 and 811.41 respectively, and its prediction accuracy is 78.71% (\(\frac{220+109}{418}\)).

step(model4d1, k = 2)  
## Start:  AIC=736.7
## survived ~ sex * pclass * log(age) + fare + embarked + sibsp + 
##     parch
## 
##                       Df Deviance    AIC
## - fare                 1   701.55 735.55
## - parch                1   702.26 736.26
## <none>                     700.70 736.70
## - embarked             3   708.84 738.84
## - sex:pclass:log(age)  2   707.31 739.31
## - sibsp                1   720.35 754.35
## 
## Step:  AIC=735.55
## survived ~ sex + pclass + log(age) + embarked + sibsp + parch + 
##     sex:pclass + sex:log(age) + pclass:log(age) + sex:pclass:log(age)
## 
##                       Df Deviance    AIC
## - parch                1   702.73 734.73
## <none>                     701.55 735.55
## - sex:pclass:log(age)  2   708.31 738.31
## - embarked             3   710.77 738.77
## - sibsp                1   720.36 752.36
## 
## Step:  AIC=734.73
## survived ~ sex + pclass + log(age) + embarked + sibsp + sex:pclass + 
##     sex:log(age) + pclass:log(age) + sex:pclass:log(age)
## 
##                       Df Deviance    AIC
## <none>                     702.73 734.73
## - sex:pclass:log(age)  2   709.87 737.87
## - embarked             3   712.23 738.23
## - sibsp                1   727.07 757.07
## 
## Call:  glm(formula = survived ~ sex + pclass + log(age) + embarked + 
##     sibsp + sex:pclass + sex:log(age) + pclass:log(age) + sex:pclass:log(age), 
##     family = "binomial", data = trainSet)
## 
## Coefficients:
##              (Intercept)                   sexmale  
##                 -16.0528                    7.1030  
##                  pclass2                   pclass3  
##                   9.6732                    3.3839  
##                 log(age)                 embarkedC  
##                   1.6472                   14.8044  
##                embarkedQ                 embarkedS  
##                  14.8853                   14.4090  
##                    sibsp           sexmale:pclass2  
##                  -0.5639                  -10.4999  
##          sexmale:pclass3          sexmale:log(age)  
##                  -7.0651                   -3.2768  
##         pclass2:log(age)          pclass3:log(age)  
##                  -3.1977                   -2.1678  
## sexmale:pclass2:log(age)  sexmale:pclass3:log(age)  
##                   2.8105                    2.6447  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  875 Residual
## Null Deviance:       1187 
## Residual Deviance: 702.7     AIC: 734.7
model4d2 <- glm(survived ~ sex + pclass + log(age) + embarked + sibsp + sex:pclass + sex:log(age) + pclass:log(age) + sex:pclass:log(age), 
         data = trainSet, family="binomial")

# AIC value
extractAIC(model4d2)
## [1]  16.0000 734.7276
# BIC value
extractAIC(model4d2, k=log(nrow(trainSet)))
## [1]  16.0000 811.4051
# summarize the prediction on the test data by a 2x2 table
model4d2.pred <- predict(model4d2, newdata = testSet, type="response")
model4d2.pred <- as.numeric(model4d2.pred > 0.5)
table(model4d2.pred, testSet[,2])
##              
## model4d2.pred   0   1
##             0 220  50
##             1  39 109
# report the prediction accuracy of model 4d2
correctPred6 <- sum(model4d2.pred == testSet[,2])
accuracyPred6 <- correctPred6/dim(testSet)[1]
accuracyPred6
## [1] 0.7870813

After running step() with the BIC criterion (k = log(n)), we obtain model 4(d)(3): survived ~ sex + pclass + log(age) + sibsp + sex:pclass + sex:log(age). The model has an AIC value and a BIC value of 741.41 and 784.54 respectively, and its prediction accuracy is 77.51% (\(\frac{220+104}{418}\)). Model 4(d)(2) is preferred to 4(d)(3) because it has higher prediction accuracy and a lower AIC.

step(model4d1, k = log(nrow(trainSet)))
## Start:  AIC=822.96
## survived ~ sex * pclass * log(age) + fare + embarked + sibsp + 
##     parch
## 
##                       Df Deviance    AIC
## - embarked             3   708.84 810.72
## - sex:pclass:log(age)  2   707.31 815.99
## - fare                 1   701.55 817.02
## - parch                1   702.26 817.73
## <none>                     700.70 822.96
## - sibsp                1   720.35 835.82
## 
## Step:  AIC=810.72
## survived ~ sex + pclass + log(age) + fare + sibsp + parch + sex:pclass + 
##     sex:log(age) + pclass:log(age) + sex:pclass:log(age)
## 
##                       Df Deviance    AIC
## - sex:pclass:log(age)  2   715.73 804.03
## - fare                 1   710.77 805.87
## - parch                1   710.99 806.08
## <none>                     708.84 810.72
## - sibsp                1   730.33 825.42
## 
## Step:  AIC=804.03
## survived ~ sex + pclass + log(age) + fare + sibsp + parch + sex:pclass + 
##     sex:log(age) + pclass:log(age)
## 
##                   Df Deviance    AIC
## - pclass:log(age)  2   719.71 794.42
## - fare             1   717.95 799.45
## - parch            1   718.44 799.95
## <none>                 715.73 804.03
## - sex:log(age)     1   724.86 806.37
## - sex:pclass       2   740.82 815.54
## - sibsp            1   737.71 819.22
## 
## Step:  AIC=794.42
## survived ~ sex + pclass + log(age) + fare + sibsp + parch + sex:pclass + 
##     sex:log(age)
## 
##                Df Deviance    AIC
## - fare          1   721.53 789.45
## - parch         1   722.35 790.28
## <none>              719.71 794.42
## - sex:log(age)  1   731.94 799.87
## - sex:pclass    2   742.94 804.07
## - sibsp         1   745.11 813.03
## 
## Step:  AIC=789.45
## survived ~ sex + pclass + log(age) + sibsp + parch + sex:pclass + 
##     sex:log(age)
## 
##                Df Deviance    AIC
## - parch         1   723.41 784.54
## <none>              721.53 789.45
## - sex:log(age)  1   733.88 795.01
## - sex:pclass    2   745.37 799.71
## - sibsp         1   745.52 806.65
## 
## Step:  AIC=784.54
## survived ~ sex + pclass + log(age) + sibsp + sex:pclass + sex:log(age)
## 
##                Df Deviance    AIC
## <none>              723.41 784.54
## - sex:log(age)  1   734.74 789.08
## - sex:pclass    2   748.04 795.59
## - sibsp         1   754.99 809.33
## 
## Call:  glm(formula = survived ~ sex + pclass + log(age) + sibsp + sex:pclass + 
##     sex:log(age), family = "binomial", data = trainSet)
## 
## Coefficients:
##      (Intercept)           sexmale           pclass2           pclass3  
##           5.4756           -0.7115           -1.1085           -3.6815  
##         log(age)             sibsp   sexmale:pclass2   sexmale:pclass3  
##          -0.4681           -0.6099           -0.9492            1.6005  
## sexmale:log(age)  
##          -0.9418  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  882 Residual
## Null Deviance:       1187 
## Residual Deviance: 723.4     AIC: 741.4
model4d3 <- glm(survived ~ sex + pclass + log(age) + sibsp + sex:pclass + sex:log(age), data = trainSet, family="binomial")
model4d3
## 
## Call:  glm(formula = survived ~ sex + pclass + log(age) + sibsp + sex:pclass + 
##     sex:log(age), family = "binomial", data = trainSet)
## 
## Coefficients:
##      (Intercept)           sexmale           pclass2           pclass3  
##           5.4756           -0.7115           -1.1085           -3.6815  
##         log(age)             sibsp   sexmale:pclass2   sexmale:pclass3  
##          -0.4681           -0.6099           -0.9492            1.6005  
## sexmale:log(age)  
##          -0.9418  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  882 Residual
## Null Deviance:       1187 
## Residual Deviance: 723.4     AIC: 741.4
# AIC value
extractAIC(model4d3)
## [1]   9.0000 741.4095
# BIC value
extractAIC(model4d3, k=log(nrow(trainSet)))
## [1]   9.0000 784.5406
# summarize the prediction on the test data by a 2x2 table
model4d3.pred <- predict(model4d3, newdata = testSet, type="response")
model4d3.pred <- as.numeric(model4d3.pred > 0.5)
table(model4d3.pred, testSet[,2])
##              
## model4d3.pred   0   1
##             0 220  55
##             1  39 104
# report the prediction accuracy of model 4d3
correctPred7 <- sum(model4d3.pred == testSet[,2])
accuracyPred7 <- correctPred7/dim(testSet)[1]
accuracyPred7
## [1] 0.7751196

4e) Among all the logistic models, model 4(d)(2) (i.e., the model selected by stepwise selection under the AIC criterion) has the smallest AIC value at 734.73 and the best prediction accuracy at 78.71%.

AICSummary <- c(834.8884,810.0969,780.4776,779.3916,762.714,736.6964,734.7276,741.4095)
names(AICSummary) <- c("4a","4b","4c1","4c2","4c3","4d1","4d2","4d3")
sort(AICSummary)
##      4d2      4d1      4d3      4c3      4c2      4c1       4b       4a 
## 734.7276 736.6964 741.4095 762.7140 779.3916 780.4776 810.0969 834.8884
BICSummary <- c(854.0577,838.851,837.9857,827.315,820.2221,822.9586,811.4051,784.5406)
names(BICSummary) <- c("4a","4b","4c1","4c2","4c3","4d1","4d2","4d3")
sort(BICSummary)
##      4d3      4d2      4c3      4d1      4c2      4c1       4b       4a 
## 784.5406 811.4051 820.2221 822.9586 827.3150 837.9857 838.8510 854.0577
predAcc <- c(0.7631579,0.7631579,0.7703349,0.7703349,0.777512,0.7870813,0.7751196)
names(predAcc) <- c("4a","4b","4c1","4c2","4c3","4d2","4d3")
sort(predAcc)
##        4a        4b       4c1       4c2       4d3       4c3       4d2 
## 0.7631579 0.7631579 0.7703349 0.7703349 0.7751196 0.7775120 0.7870813

Part 5: Tree Models

Reuse the training and test sets from Part 4, drop the variables Life_boat, name, ticket and cabin, then explore various tree models with the remaining variables.

library(rpart)
drops <- c("Life_boat", "name", "ticket", "cabin")
trainSet <- trainSet[, !names(trainSet) %in% drops]
testSet <- testSet[, !names(testSet) %in% drops]

5a) Fit a classification tree with the initial control parameters minsplit = 5, cp = 0.000001, maxdepth = 30 and method = "class". The cp value with the minimum cross-validation estimate of the misclassification error (xerror) is then extracted from the cp table and, as shown below, equals 0.001949318; tree model 2 is refit with this value.

# Tree model 1
tree1 <- rpart(survived ~ ., data = trainSet, minsplit = 5, cp = 0.000001, maxdepth = 30, method = "class")
plotcp(tree1)

tree1$cptable[which.min(tree1$cptable[,"xerror"]),"CP"]
## [1] 0.001949318
# Tree model 2, updated cp value
tree2 <- rpart(survived ~ ., data = trainSet, minsplit = 5, cp = 0.001949318, method = "class")
plotcp(tree2)