Titanic_Project

## Who Gets to Survive the Titanic Disaster?
## Chang Liu and Alexander Patel
## Math 156
## October 20, 2014

## The R Environment
# For graphs and charts, we will use the 'ggplot2' plotting library
#install.packages("ggplot2")
library("ggplot2")
#install.packages("scales")
library("scales")

## The Dataset
#  Our dataset comprises personal and logistical data on the
#  passengers on the Titanic. 
#  The data can be found at:
#   http://lib.stat.cmu.edu/S/Harrell/data/descriptions/titanic.html. 
titanic <- read.csv("titanic3.csv"); str(titanic)

## 'data.frame':    1310 obs. of  14 variables:
##  $ pclass   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ survived : int  1 1 0 0 0 1 1 0 1 0 ...
##  $ name     : Factor w/ 1308 levels "","Abbing, Mr. Anthony",..: 23 25 26 27 28 32 47 48 52 56 ...
##  $ sex      : Factor w/ 3 levels "","female","male": 2 3 2 3 2 3 2 3 2 3 ...
##  $ age      : num  29 0.917 2 30 25 ...
##  $ sibsp    : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch    : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ ticket   : Factor w/ 930 levels "","110152","110413",..: 189 51 51 51 51 126 94 17 78 827 ...
##  $ fare     : num  211 152 152 152 152 ...
##  $ cabin    : Factor w/ 187 levels "","A10","A11",..: 45 81 81 81 81 151 147 17 63 1 ...
##  $ embarked : Factor w/ 4 levels "","C","Q","S": 4 4 4 4 4 4 4 4 4 2 ...
##  $ boat     : Factor w/ 28 levels "","1","10","11",..: 13 4 1 1 1 14 3 1 28 1 ...
##  $ body     : int  NA NA NA 135 NA NA NA NA NA 22 ...
##  $ home.dest: Factor w/ 370 levels "","?Havana, Cuba",..: 310 232 232 232 232 238 163 25 23 230 ...

# The data is a sample of 1310 passengers from the population size of 1317
#  so it approximates well the population
# There are 14 columns, but we will be looking at 5:
#  * 3 categorial/logical: passenger class, sex, survival
#  * 2 numerical: age, fare
titanic <- data.frame(titanic$survived, titanic$age, titanic$sex, titanic$fare, titanic$pclass)
names(titanic) <- c("survived", "age", "sex", "fare", "pclass"); head(titanic)

##   survived     age    sex   fare pclass
## 1        1 29.0000 female 211.34      1
## 2        1  0.9167   male 151.55      1
## 3        0  2.0000 female 151.55      1
## 4        0 30.0000   male 151.55      1
## 5        0 25.0000 female 151.55      1
## 6        1 48.0000   male  26.55      1

## PART I: AGE
# The data set contains age data on ~80% of the passengers (1046 / 1310)
age.nna <- titanic[!is.na(titanic$age),] # strip data of rows with age=NULL
summary(age.nna$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.17   21.00   28.00   29.90   39.00   80.00

# Notice the min/max: 2 months versus 80 years!
# Let's overlay a PDG on a histogram of age
ggplot(age.nna, aes(x=age)) + 
    ggtitle("Passenger Age") + 
    xlab("Age") + 
    ylab("Density") + 
    geom_histogram(aes(y=..density..), binwidth=1)+
    geom_density(alpha=.5, fill="#FFFFFF")

plot of chunk unnamed-chunk-1

# We can see the discrepancies at extreme ages with a normal quantile plot
# The data is noticeably right-skewed, although there is clear dip in the number
#  of pre-teen and teenage passengers (for good reason, even in 1912)
qqnorm(age.nna$age, main="Passenger Age: Normal Quantile Plot")
qqline(age.nna$age)

plot of chunk unnamed-chunk-1

# Let's do a boxplot of age vs. gender and add in as a reference the 1912 U.S. Life Expectancy
#  from: http://demog.berkeley.edu/~andrew/1918/figure2.html
# Turns out, there are more "old" passengers than meets the eye!
p <- qplot(x=age.nna$sex, y=age.nna$age, data=age.nna, 
           geom=c("boxplot", "jitter"), main="Age vs. Gender", xlab="Gender", 
           ylab="Age") + coord_flip()
p + geom_hline(yintercept = 51.5, color="blue", label="Life Exp. (M)") + 
    geom_hline(yintercept = 55.9, colour="red", label="Life Exp. (F)")

plot of chunk unnamed-chunk-1

# Now, let's look at on particular category of the age group--children--and test 
#  whether the second part of "women and children first" (a code of conduct most famously associated with the Titanic) holds. 
# There are 154 children in the data set.
length(which(titanic$age < 18))

## [1] 154

# Let's add a new categorical variable for child/adult.
titanic$age_group <- "adult"
titanic$age_group[titanic$age < 18] <- "child"
# 52% of children and only 36% of adults survived.
# But the child population is much smaller than the adult one. 
age_group.survived <- table(titanic$age_group, titanic$survived); age_group.survived

##        
##           0   1
##   adult 736 419
##   child  73  81

age_group.survived.prop <- prop.table(age_group.survived, 1); age_group.survived.prop

##        
##              0      1
##   adult 0.6372 0.3628
##   child 0.4740 0.5260

# Is the difference in survival rates significant? Let's see with a permutation test.
adults.survived <- age_group.survived.prop[3]
children.survived <- age_group.survived.prop[4]
observed <-  children.survived / adults.survived; observed # children are 1.45x more likely to survive

## [1] 1.45

N=10^4-1 ; result<-numeric(N)  
for (i in 1:N) {
    index <- sample(nrow(titanic), size=154, replace=FALSE)
    child.sample <- length(which(titanic$survived[index] == 1)) / length(index)
    adult.sample <- length(which(titanic$survived[-index] == 1)) / (nrow(titanic) - length(index))
    result[i] <- child.sample / adult.sample 
}
qplot(result, binwidth=.05) + 
    geom_vline(xintercept = observed, color="red", label="Observed") +
    ggtitle("Permutation Test: Child Survival / Adult Survival") + 
    xlab("Ratio") + 
    ylab("Count")

plot of chunk unnamed-chunk-1

pvalue = (sum (result >= observed) + 1)/(N+1); 2*pvalue

## [1] 2e-04

# A near-zero p-value indicates that the evidence supports the hypothesis that 
#  children are more likely to have survived

## PART II: Gender
# extract passengers with non-null sex/survival data
index<-which((!is.na(titanic$sex) )&(!is.na(titanic$survived))); 
sex<-data.frame(titanic$survived[index],titanic$sex[index])
names(sex) <- c("survive ", "sex"); head(sex)

##   survive     sex
## 1        1 female
## 2        1   male
## 3        0 female
## 4        0   male
## 5        0 female
## 6        1   male

# 466 females and 843 males have survival data
nrow(subset(sex, sex == "female")); nrow(subset(sex, sex == "male"))

## [1] 466

## [1] 843

# a contingency table for gender
counts<-table(sex)[,c(0,2,3)]; counts

##         sex
## survive  female male
##        0    127  682
##        1    339  161

# Women's survival rate 
women.survived<-counts[2]/(counts[1]+counts[2]);women.survived

## [1] 0.7275

# Men's survival rate
men.survived <-counts[4]/(counts[3]+counts[4]);men.survived

## [1] 0.191

# Women are almost 4 times more likely to survive than men!
observed <- women.survived / men.survived; observed

## [1] 3.809

# Visualizing gender vs. survival
barplot(counts, ylim=c(0, 1000), xlab="Gender",ylab="Count",main="Survival by Sex")
text(.71, 520, paste(as.character(round(women.survived, 3) * 100),"% Survived"))
text(1.9, 890, paste(as.character(round(men.survived, 3 ) * 100),"% Survived"))
legend("bottomright", fill=c("black", "grey"), legend=c("Perished", "Survived"))

plot of chunk unnamed-chunk-1

# Is the survival ratio of women over men statistically significant? 
# Let's do two tests. First, a permutation test for the ratio of women and men who survived
N=10^3-1 ; result<-numeric(N)  
for (i in 1:N) {
    per.sex<-sample(titanic$sex)
    counts2<-table(titanic$survived,per.sex);counts2<-counts2[,c(0,2,3)]
    women<-counts2[2]/(counts2[1]+counts2[2]) # women's survival rate 
    men<-counts2[4]/(counts2[3]+counts2[4]) # men's survival rate
    result[i]<-women/men  
}
qplot(result, binwidth=.05) + 
    geom_vline(xintercept = observed, color="red", label="Observed") + 
    xlim(0, 4) +
    ggtitle("Permutation Test: Female Survival / Male Survival") + 
    xlab("Ratio") + 
    ylab("Count")

plot of chunk unnamed-chunk-1

pValue = (sum (result >= observed) + 1)/(N+1); 2*pValue #double for 2-sided test

## [1] 0.002

# Far below 1% pvalue level: the observed ratio is extremely unlikely to occur by chance! 
# We have reason now to belive that women are saved first

# Now let's perform a chi-square test to further verify our conviction: 
# NUll hypothesis: sex and survival are independent VS
# Alternative hypothesis - women more likely to survive  
chisq.test(counts)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  counts
## X-squared = 363.6, df = 1, p-value < 2.2e-16

# first glance: two variables highly dependent with statistical significance
chisq1 <-function(Obs){
    Expected <- rep(sum(Obs)/length(Obs),length(Obs))
    sum((Obs-Expected)^2/Expected)
}
#Calculate chi square for the observed data
Chi1<-chisq1(counts);Chi1

## [1] 592

# calculate its p-value 
pchisq(Chi1,1,lower.tail=FALSE)

## [1] 9.305e-131

# extremely small at 1% p-value level
# strongly suggesting the relationship is not indepedent 

# BONUS point: advantage of simulation over classical chisquare test

# Although in agreement on the inference, however, the large discrepany 
# in values between the built-in test and our calculation makes us nervous 
# e.g.p-values: 2.2e-16 for built-in; 9.304668e-131 for calculation
# The same issue appears in the analysis of class and survival!
# But we have no easy way to find out why! 
# Also the degree of power in the values (esp. -131?!) are just way too high
# Let's see if simulation can remedy this issue. 

# simulate the results
N = 10^4-1 ; sex.result <- numeric(N)
for (i in 1:N) {
    per_sex<- sample(titanic$sex);
    counts3<-table(titanic$survived,per_sex)
    counts3<-counts3[,c(0,2,3)]
    sex.result[i]<-chisq1(counts3)
}
qplot(sex.result, binwidth=1) + 
    geom_vline(xintercept = Chi1, color="red", label="Observed") + 
    xlim(100, 600) +
    ggtitle("Simulation: Sex Survival") + 
    xlab("Chi-squared") + 
    ylab("Count")

plot of chunk unnamed-chunk-1

pVal <-(sum (sex.result >= Chi1) +1)/(N+1); pVal;pVal*2

## [1] 1e-04

## [1] 2e-04

# Not only the pvalue from simulation clearly is more reasonable
# but also it confirms the same inference without giving 
# the unknown discrepany and extremeness in values in the classical method.

## PART III: Passenger Class and Fare
# Fare
fare <- titanic$fare
# we have fare data for most of the passengers in the data set
length(fare); index<-which(!is.na(fare)); fare<-fare[index]; length(fare)

## [1] 1310

## [1] 1308

hist(fare,50,col="blue",freq=FALSE, main="Ticket Fare")

plot of chunk unnamed-chunk-1

# highly skewed right with a long tail
summary(fare)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.9    14.5    33.3    31.3   512.0

# Most fares are less than 34 Pre-1970 British Pounds 
# but there are seats as expensive as 512 Pounds and as cheap as free
qqnorm(fare)
qqline(fare)

plot of chunk unnamed-chunk-1

# Passenger class is a proxy for social-economic status
pclass <- titanic$pclass
survive <- titanic$survived
barplot(table(pclass),xlab="Passenger Class",ylab="Number of People",main="Distribution of Passengers by Class")

plot of chunk unnamed-chunk-1

# the ship has slight more 3rd class passengers than the sum of 1st and 2nd class
counts<-table(survive,pclass);counts

##        pclass
## survive   1   2   3
##       0 123 158 528
##       1 200 119 181

chisq.test(counts) # far below 1% pvalue level suggests high dependence for a large chisq

## 
##  Pearson's Chi-squared test
## 
## data:  counts
## X-squared = 127.9, df = 2, p-value < 2.2e-16

# plot a graph to support our inference
barplot(counts, xlab="passenger class",ylab="number of people",main="survival by class")

plot of chunk unnamed-chunk-1

# it seems first class has more survival than other classes
# could this be due to some chance event?

# testing hypothesis: do rich passengers get saved more often? 
chisq1 <-function(Obs){
  Expected <- rep(sum(Obs)/length(Obs),length(Obs))
  sum((Obs-Expected)^2/Expected)
}
Chi1<-chisq1(counts);Chi1

## [1] 551

#a large chi-square value for the observed data
pchisq(Chi1,2,lower.tail=FALSE) # highly significant

## [1] 2.202e-120

# A small p-value with a large chisq value suggests high dependence
# between survivalibity and class

# Again, the unknown discrepany and extremeness in values appear in the classical method. 
# But we can also simulate the results to verify the same thing for comparison.
N = 10^3-1 ; class.result <- numeric(N)
for (i in 1:N) {
  per_class<- sample(pclass)
  class.result[i]<-chisq1(table(per_class,survive))
}
qplot(class.result, binwidth=1) + 
    geom_vline(xintercept = Chi1, color="red", label="Observed") + 
    xlim(300,600) +
    ggtitle("Simulation: Survival by Class") + 
    xlab("chi-squared") + 
    ylab("Count")

plot of chunk unnamed-chunk-1

pVal <-(sum (class.result >= Chi1) +1)/(N+1); pVal*2 # below 1% level

## [1] 0.002

# Simulation clearly verifies the same conclusion without giving the mystery.
# It works practically better.

# The evidence supports the hypothesis that passengers in lower-numbered passenger classes are
#  more likely to have survived.the sinking of the Titanic.

# Conclusion:
# Women, children, and the first class are more likely to survive than others

Titanic_Project_Chang.R

liuchang

Tue Dec 23 23:26:35 2014