library(MASS)      # lda() and qda()
library(ggplot2)   # plotting
train <- read.csv("titanic_train.csv")
test <- read.csv("titanic_test.csv")
test$Survived <- NA   # add the response column so the two sets can be stacked
all <- rbind(train, test)
colSums(is.na(all))   # count NAs per column
## PassengerId Survived Pclass Name Sex Age
## 0 418 0 0 0 263
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 1 0 0
# Count empty-string cells per column; columns that contain NA report NA here,
# since NA == "" evaluates to NA
space <- function(x) sum(x == "")
apply(all, 2, space)
## PassengerId Survived Pclass Name Sex Age
## 0 NA 0 0 0 NA
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 NA 1014 2
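Note that columns containing NA report NA above, because NA == "" is itself NA. An alternative (a sketch, not how the original proceeds) is to recode blanks as NA up front, so a single colSums(is.na(.)) call captures both kinds of missingness:
blank <- !is.na(all) & all == ""   # TRUE only where a cell is an empty string
all_na <- all                      # work on a copy; `all` itself is left unchanged
all_na[blank] <- NA
colSums(is.na(all_na))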
We have 263 observations with missing Age; let's try to predict it from Fare and passenger class.
ggplot(data = all, aes(x = Fare, y = Age, col = factor(Pclass))) +
geom_smooth()
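The smoothers alone can hide how sparse the data are at high fares; a small variant (not in the original) overlays the raw points:
ggplot(all, aes(x = Fare, y = Age, col = factor(Pclass))) +
  geom_point(alpha = 0.3) +   # raw observations behind the smoothers
  geom_smooth()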
index <- which(is.na(all$Age))
predict_age <- all[index,]    # rows with missing Age, to be imputed
train_age <- all[-index,]     # rows with known Age, used to fit the model
lm_age <- lm(Age ~ Pclass + Fare + Pclass:Fare, data = train_age)   # equivalent to Age ~ Pclass * Fare
summary(lm_age)
##
## Call:
## lm(formula = Age ~ Pclass + Fare + Pclass:Fare, data = train_age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.908 -8.117 -1.360 7.775 47.876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.60475 1.56595 29.123 < 2e-16 ***
## Pclass -5.64937 0.67503 -8.369 < 2e-16 ***
## Fare 0.14181 0.02824 5.021 6.03e-07 ***
## Pclass:Fare -0.15585 0.02583 -6.033 2.23e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.88 on 1041 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2012, Adjusted R-squared: 0.1988
## F-statistic: 87.38 on 3 and 1041 DF, p-value: < 2.2e-16
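As a rough sanity check on the imputation model (a sketch, not in the original), the in-sample RMSE gives a feel for the typical error in a predicted age; it should land near the 12.88 residual standard error reported above.
sqrt(mean(resid(lm_age)^2))   # in-sample RMSE of the age model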
# Impute the missing ages with the model's predictions, truncated to whole years
predict_age$Age <- as.integer(predict(lm_age, newdata = predict_age))
all <- rbind(train_age, predict_age)
colSums(is.na(all))
## PassengerId Survived Pclass Name Sex Age
## 0 418 0 0 0 0
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 1 0 0
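One side effect of the rbind() above is that rows are now ordered with the age-complete passengers first. If the original order matters (e.g. for writing a submission file later), it can be restored by PassengerId:
all <- all[order(all$PassengerId), ]   # restore the original row order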
all$family <- all$SibSp + all$Parch + 1   # family size: siblings/spouses + parents/children + self
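A quick tabulation (a sketch, not in the original) shows the distribution of the new feature:
table(all$family)   # family sizes across the combined data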
Split the data for survival prediction.
# Convert categorical variables to factor type
all$Survived <- as.factor(all$Survived)
all$Pclass <- as.factor(all$Pclass)
index <- which(is.na(all$Survived))
train <- all[-index,]   # labelled rows (original training set)
test <- all[index,]     # unlabelled rows (original test set)
str(all)
## 'data.frame': 1309 obs. of 13 variables:
## $ PassengerId: int 1 2 3 4 5 7 8 9 10 11 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 520 629 417 581 732 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ SibSp : int 1 1 0 1 0 0 3 0 1 1 ...
## $ Parch : int 0 0 0 0 0 0 1 2 0 1 ...
## $ Ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 86 396 345 133 617 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 131 1 1 1 147 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
## $ family : num 2 2 1 2 1 1 5 3 2 3 ...
# Hold out 20% of the labelled data to evaluate the classifiers
index <- sample(1:nrow(train), nrow(train)*0.8)
train_t <- train[index,]
test_t <- train[-index,]
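Because sample() draws randomly, the exact counts in the confusion matrices below vary from run to run. A reproducible variant (the seed value is arbitrary, not from the original) fixes the RNG state first:
set.seed(42)   # arbitrary seed, chosen only for reproducibility
index <- sample(1:nrow(train), nrow(train) * 0.8)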
log.fit <- glm(Survived ~ Pclass + Sex + Age + Fare + Embarked + family, family = "binomial", data = train_t)
summary(log.fit)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare + Embarked +
## family, family = "binomial", data = train_t)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6976 -0.6187 -0.4067 0.6138 2.5528
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 15.401549 535.411352 0.029 0.97705
## Pclass2 -1.259346 0.337487 -3.732 0.00019 ***
## Pclass3 -2.543961 0.351639 -7.235 4.67e-13 ***
## Sexmale -2.838304 0.227889 -12.455 < 2e-16 ***
## Age -0.045196 0.009507 -4.754 2.00e-06 ***
## Fare 0.001457 0.002501 0.583 0.56006
## EmbarkedC -10.749439 535.411267 -0.020 0.98398
## EmbarkedQ -10.681581 535.411354 -0.020 0.98408
## EmbarkedS -10.981700 535.411256 -0.021 0.98364
## family -0.234607 0.077335 -3.034 0.00242 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 943.08 on 711 degrees of freedom
## Residual deviance: 615.14 on 702 degrees of freedom
## AIC: 635.14
##
## Number of Fisher Scoring iterations: 12
Fare and Embarked are not significant; the enormous Embarked standard errors signal quasi-separation, driven by the two rows whose Embarked value is blank. Drop both predictors and refit:
log.fit2 <- glm(Survived ~ Pclass + Sex + Age + family, family = "binomial", data = train_t)
summary(log.fit2)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + family, family = "binomial",
## data = train_t)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7178 -0.5939 -0.4021 0.6187 2.5351
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.711912 0.527812 8.927 < 2e-16 ***
## Pclass2 -1.424161 0.300026 -4.747 2.07e-06 ***
## Pclass3 -2.670890 0.297451 -8.979 < 2e-16 ***
## Sexmale -2.876431 0.225357 -12.764 < 2e-16 ***
## Age -0.046654 0.009465 -4.929 8.26e-07 ***
## family -0.237618 0.074528 -3.188 0.00143 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 943.08 on 711 degrees of freedom
## Residual deviance: 617.04 on 706 degrees of freedom
## AIC: 629.04
##
## Number of Fisher Scoring iterations: 5
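A quick side-by-side of the two fits (a sketch, not in the original): per the summaries above, the AICs are 635.14 and 629.04, so the reduced model is preferred.
AIC(log.fit, log.fit2)   # lower AIC favors the smaller model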
# Predicted survival probabilities on the holdout set, thresholded at 0.5
test_t$prob <- predict(log.fit2, newdata = test_t, type = "response")
test_t$pred <- ifelse(test_t$prob > 0.5, 1, 0)
table(test_t$pred, test_t$Survived)
##
## 0 1
## 0 94 27
## 1 11 47
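From the table above, the logistic model's holdout accuracy is (94 + 47) / 179 ≈ 0.79. The same figure can be computed directly, as in this sketch:
mean(test_t$pred == test_t$Survived)   # holdout accuracy of the logistic model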
lda.model <- lda(Survived ~ Pclass + Sex + Age + Parch, data = train_t)
lda.model
## Call:
## lda(Survived ~ Pclass + Sex + Age + Parch, data = train_t)
##
## Prior probabilities of groups:
## 0 1
## 0.6235955 0.3764045
##
## Group means:
## Pclass2 Pclass3 Sexmale Age Parch
## 0 0.1869369 0.6644144 0.8536036 29.63626 0.3265766
## 1 0.2611940 0.3432836 0.3134328 28.36881 0.4253731
##
## Coefficients of linear discriminants:
## LD1
## Pclass2 -0.88656018
## Pclass3 -1.67765423
## Sexmale -2.19786825
## Age -0.02424166
## Parch -0.17545492
plot(lda.model)   # histograms of the discriminant scores by class
lda_prob <- predict(lda.model, newdata = test_t)   # list with class labels and posteriors
table(Predicted = lda_prob$class, test_t$Survived)
##
## Predicted 0 1
## 0 91 26
## 1 14 48
qda.model <- qda(Survived ~ Pclass + Sex + Age + Parch, data = train_t)
qda.model
## Call:
## qda(Survived ~ Pclass + Sex + Age + Parch, data = train_t)
##
## Prior probabilities of groups:
## 0 1
## 0.6235955 0.3764045
##
## Group means:
## Pclass2 Pclass3 Sexmale Age Parch
## 0 0.1869369 0.6644144 0.8536036 29.63626 0.3265766
## 1 0.2611940 0.3432836 0.3134328 28.36881 0.4253731
qda_prob <- predict(qda.model, newdata = test_t)
table(Predicted = qda_prob$class, test_t$Survived)
##
## Predicted 0 1
## 0 87 23
## 1 18 51
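Pulling the three confusion matrices together (a sketch, not in the original): logistic regression classifies 141/179 ≈ 0.79 of the holdout passengers correctly, LDA 139/179 ≈ 0.78, and QDA 138/179 ≈ 0.77.
c(logistic = mean(test_t$pred == test_t$Survived),
  lda = mean(lda_prob$class == test_t$Survived),
  qda = mean(qda_prob$class == test_t$Survived))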
LDA and QDA work well when the classes are well separated and the predictors are roughly normal within each class. When the normality assumption fails, logistic regression has an edge over LDA and QDA. Conversely, logistic regression breaks down when the response classes are perfectly separated, as the inflated Embarked standard errors above hinted. In practice, logistic regression is the usual choice for binary classification, while LDA and QDA extend more naturally to problems with more than two response classes.
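The separation point is easy to demonstrate on toy data (an illustration, not the Titanic data): when a predictor splits the classes perfectly, glm() warns that fitted probabilities are numerically 0 or 1 and the coefficient estimate diverges.
toy <- data.frame(y = factor(c(0, 0, 0, 1, 1, 1)), x = c(1, 2, 3, 10, 11, 12))
sep.fit <- glm(y ~ x, family = "binomial", data = toy)   # x separates y perfectly
summary(sep.fit)$coefficients   # huge estimate and standard error for x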