Load the training and testing data:
training = read.csv("train.csv")
testing = read.csv("test.csv")
dim(training)
## [1] 891 12
dim(testing)
## [1] 418 11
str(training)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
summary(training)
## PassengerId Survived Pclass
## Min. : 1.0 Min. :0.0000 Min. :1.000
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000
## Median :446.0 Median :0.0000 Median :3.000
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Name Sex Age
## Abbing, Mr. Anthony : 1 female:314 Min. : 0.42
## Abbott, Mr. Rossmore Edward : 1 male :577 1st Qu.:20.12
## Abbott, Mrs. Stanton (Rosa Hunt) : 1 Median :28.00
## Abelson, Mr. Samuel : 1 Mean :29.70
## Abelson, Mrs. Samuel (Hannah Wizosky): 1 3rd Qu.:38.00
## Adahl, Mr. Mauritz Nils Martin : 1 Max. :80.00
## (Other) :885 NA's :177
## SibSp Parch Ticket Fare
## Min. :0.000 Min. :0.0000 1601 : 7 Min. : 0.00
## 1st Qu.:0.000 1st Qu.:0.0000 347082 : 7 1st Qu.: 7.91
## Median :0.000 Median :0.0000 CA. 2343: 7 Median : 14.45
## Mean :0.523 Mean :0.3816 3101295 : 6 Mean : 32.20
## 3rd Qu.:1.000 3rd Qu.:0.0000 347088 : 6 3rd Qu.: 31.00
## Max. :8.000 Max. :6.0000 CA 2144 : 6 Max. :512.33
## (Other) :852
## Cabin Embarked
## :687 : 2
## B96 B98 : 4 C:168
## C23 C25 C27: 4 Q: 77
## G6 : 4 S:644
## C22 C26 : 3
## D : 3
## (Other) :186
Randomly pick up 5 variables and create a scatter-plot matrix:
library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.1.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.1.3
featurePlot(x = training[, c(3, 5, 6, 8, 11)], y = training$Survived, plot = "pairs", auto.key = list(columns = 5))
Select useful predictors
trainingSel = subset(training, select = c('Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked'))
testingSel = subset(testing, select = c('Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked'))
head(trainingSel)
## Survived Pclass Sex Age SibSp Parch Embarked
## 1 0 3 male 22 1 0 S
## 2 1 1 female 38 1 0 C
## 3 1 3 female 26 0 0 S
## 4 1 1 female 35 1 0 S
## 5 0 3 male 35 0 0 S
## 6 0 3 male NA 0 0 Q
head(testingSel)
## Pclass Sex Age SibSp Parch Embarked
## 1 3 male 34.5 0 0 Q
## 2 3 female 47.0 1 0 S
## 3 2 male 62.0 0 0 Q
## 4 3 male 27.0 0 0 S
## 5 3 female 22.0 1 1 S
## 6 3 male 14.0 0 0 S
Remove Embaked with empty value
trainingSel = subset(trainingSel, Embarked != '')
trainingSel$Embarked = droplevels(trainingSel$Embarked, "")
testingSel = subset(testingSel, Embarked != '')
Replace NA age with the median of Age
agemedian = median(trainingSel$Age, na.rm = TRUE)
trainingSel$Age = replace(trainingSel$Age, is.na(trainingSel$Age), agemedian)
testingSel$Age = replace(testing$Age, is.na(testing$Age), agemedian)
Separate the predictors and class labels.
trainingVars = trainingSel[, c(2:7)]
objLabels = trainingSel[, 1]
objLabels = as.factor(objLabels)
testingVars = testingSel
testingPsgIds = testing[, 1]
The pre-processed training data is split into two subsets: 75% for training prediction models and 25% as validation set for evaluating error rates and selecting models.
inTrain = createDataPartition(objLabels, p = 0.75, list = FALSE)
forTrainingVars = trainingVars[inTrain, ]
forTestingVars = trainingVars[-inTrain, ]
forTrainingLabels = objLabels[inTrain]
forTestingLabels = objLabels[-inTrain]
For this classification problem, I first train a decision tree and evaluate its error rate on the validation set. I will use the error rate of the decision tree as a baseline error rate. In the later part of the document, I will train other models and use cross validation to select the best one.
Train a decision tree using the caret package.
set.seed(2345)
modeldt = train(y = forTrainingLabels, x = forTrainingVars, method = "rpart")
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 3.1.3
Predict the labels for the validation set and estimate the error rate.
preddt = predict(modeldt, forTestingVars)
confusionMatrix(forTestingLabels, preddt)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 131 6
## 1 35 50
##
## Accuracy : 0.8153
## 95% CI : (0.7579, 0.8641)
## No Information Rate : 0.7477
## P-Value [Acc > NIR] : 0.0107
##
## Kappa : 0.5821
## Mcnemar's Test P-Value : 1.226e-05
##
## Sensitivity : 0.7892
## Specificity : 0.8929
## Pos Pred Value : 0.9562
## Neg Pred Value : 0.5882
## Prevalence : 0.7477
## Detection Rate : 0.5901
## Detection Prevalence : 0.6171
## Balanced Accuracy : 0.8410
##
## 'Positive' Class : 0
##
The accuracy of the decision tree is (0.8198). To improve the accuracy, I will train several different models in the following.
Train a naive Bayes model and estimate its accuracy.
set.seed(2345)
library(klaR)
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.1.3
library(MASS)
modelnb = train(y = forTrainingLabels, x = forTrainingVars, method = "nb", )
prednb = predict(modelnb, forTestingVars)
confusionMatrix(forTestingLabels, prednb)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 123 14
## 1 36 49
##
## Accuracy : 0.7748
## 95% CI : (0.7141, 0.828)
## No Information Rate : 0.7162
## P-Value [Acc > NIR] : 0.029309
##
## Kappa : 0.4988
## Mcnemar's Test P-Value : 0.002979
##
## Sensitivity : 0.7736
## Specificity : 0.7778
## Pos Pred Value : 0.8978
## Neg Pred Value : 0.5765
## Prevalence : 0.7162
## Detection Rate : 0.5541
## Detection Prevalence : 0.6171
## Balanced Accuracy : 0.7757
##
## 'Positive' Class : 0
##
The accuracy of the naive Bayes model is (0.8108).
Train a random forest model using the the training data as used in training the naive Bayes model.
modelrf = train(y = forTrainingLabels, x = forTrainingVars, method = "rf", )
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
predrf = predict(modelrf, forTestingVars)
confusionMatrix(forTestingLabels, predrf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 130 7
## 1 34 51
##
## Accuracy : 0.8153
## 95% CI : (0.7579, 0.8641)
## No Information Rate : 0.7387
## P-Value [Acc > NIR] : 0.004683
##
## Kappa : 0.5841
## Mcnemar's Test P-Value : 4.896e-05
##
## Sensitivity : 0.7927
## Specificity : 0.8793
## Pos Pred Value : 0.9489
## Neg Pred Value : 0.6000
## Prevalence : 0.7387
## Detection Rate : 0.5856
## Detection Prevalence : 0.6171
## Balanced Accuracy : 0.8360
##
## 'Positive' Class : 0
##
The accuracy of the random forest model trained by using all the available training data is (0.8378).
I am curious about whether we can select a smaller set of features/predictors to further improve the training process, e.g., better accuracy or faster training time.
First, create an integer vector for the specific subset sizes of the predictors that should be tested.
subsets <- c(1:6)
Use recursive feature elimination via caret to find the important features. We train a series random forest models and select features through repeated cross validations on different sizes of feature sets.
set.seed(2345)
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
ctrl <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
repeats = 5,
verbose = FALSE)
rfProfile <- rfe(forTrainingVars, forTrainingLabels,
sizes = subsets,
rfeControl = ctrl)
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
rfProfile
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.7871 0.5435 0.04317 0.09249
## 2 0.7907 0.5121 0.04217 0.10229
## 3 0.8047 0.5572 0.04102 0.09712
## 4 0.8167 0.5981 0.03291 0.07551
## 5 0.8180 0.6005 0.03991 0.09055
## 6 0.8227 0.6094 0.03647 0.08223 *
##
## The top 5 variables (out of 6):
## Sex, Pclass, Age, SibSp, Parch
Extract the data.
featureEliTrainingVars = forTrainingVars[, c("Sex", "Pclass", "Age", "Parch", "Embarked")]
featureEliTestingVars = forTestingVars[, c("Sex", "Pclass", "Age", "Parch", "Embarked")]
featureEliFinalTestingVars = testingVars[, c("Sex", "Pclass", "Age", "Parch", "Embarked")]
dim(featureEliTrainingVars)
## [1] 667 5
dim(featureEliTestingVars)
## [1] 222 5
dim(featureEliFinalTestingVars)
## [1] 418 5
Train a random forest model using only a subset of the variables.
modelrf4vars = train(y = forTrainingLabels, x = featureEliTrainingVars, method = "rf", )
predrf4vars = predict(modelrf4vars, featureEliTestingVars)
confusionMatrix(forTestingLabels, predrf4vars)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 132 5
## 1 37 48
##
## Accuracy : 0.8108
## 95% CI : (0.753, 0.8601)
## No Information Rate : 0.7613
## P-Value [Acc > NIR] : 0.04648
##
## Kappa : 0.5689
## Mcnemar's Test P-Value : 1.724e-06
##
## Sensitivity : 0.7811
## Specificity : 0.9057
## Pos Pred Value : 0.9635
## Neg Pred Value : 0.5647
## Prevalence : 0.7613
## Detection Rate : 0.5946
## Detection Prevalence : 0.6171
## Balanced Accuracy : 0.8434
##
## 'Positive' Class : 0
##
The accuracy of the model trained by only a subset of the variables is (0.8198).
Apply the random forest models trained by the full set of variables and the reduced set of variables to the test set. Check their prediction agreement.
resultsrf4vars = predict(modelrf4vars, featureEliFinalTestingVars)
resultsrf = predict(modelrf, testingVars)
confusionMatrix(resultsrf4vars, resultsrf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 296 14
## 1 1 107
##
## Accuracy : 0.9641
## 95% CI : (0.9415, 0.9798)
## No Information Rate : 0.7105
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9099
## Mcnemar's Test P-Value : 0.001946
##
## Sensitivity : 0.9966
## Specificity : 0.8843
## Pos Pred Value : 0.9548
## Neg Pred Value : 0.9907
## Prevalence : 0.7105
## Detection Rate : 0.7081
## Detection Prevalence : 0.7416
## Balanced Accuracy : 0.9405
##
## 'Positive' Class : 0
##
The two models agree about (95%) of the test cases.
Make submission file
resultsrfint = as.numeric(as.character(resultsrf))
ids_results_rf = as.data.frame(cbind(testingPsgIds, resultsrfint))