R Markdown
library(knitr)
library(ggplot2)
library(gridExtra)
library(scales)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
# read the data sets and drop rows with any missing values
train <- na.omit(read.csv("C:\\Users\\pranit\\Documents\\Imarticus\\Model\\train.csv"))
test <- na.omit(read.csv("C:\\Users\\pranit\\Documents\\Imarticus\\Model\\test.csv"))
allSet <- bind_rows(train, test)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
(bind_rows repeats this pair of warnings for each factor column it coerces to character.)
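The coercion warnings appear because read.csv() (in R versions before 4.0) converts strings to factors, and train and test end up with differing factor levels. A minimal sketch that avoids them by keeping strings as characters until the explicit factor conversions below:

train <- na.omit(read.csv("C:\\Users\\pranit\\Documents\\Imarticus\\Model\\train.csv",
                          stringsAsFactors = FALSE))
test <- na.omit(read.csv("C:\\Users\\pranit\\Documents\\Imarticus\\Model\\test.csv",
                         stringsAsFactors = FALSE))
allSet <- bind_rows(train, test)  # binds cleanly, no coercion warnings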
summary(allSet)
## PassengerId Survived Pclass Name
## Min. : 1 Min. :0.0000 Min. :1.000 Length:1045
## 1st Qu.: 326 1st Qu.:0.0000 1st Qu.:1.000 Class :character
## Median : 662 Median :0.0000 Median :2.000 Mode :character
## Mean : 655 Mean :0.4062 Mean :2.207
## 3rd Qu.: 973 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :1307 Max. :1.0000 Max. :3.000
## NA's :331
## Sex Age SibSp Parch
## female:388 Min. : 0.17 Min. :0.0000 Min. :0.0000
## male :657 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median :28.00 Median :0.0000 Median :0.0000
## Mean :29.85 Mean :0.5033 Mean :0.4211
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :80.00 Max. :8.0000 Max. :6.0000
##
## Ticket Fare Cabin Embarked
## Length:1045 Min. : 0.00 Length:1045 Length:1045
## Class :character 1st Qu.: 8.05 Class :character Class :character
## Mode :character Median : 15.75 Mode :character Mode :character
## Mean : 36.69
## 3rd Qu.: 35.50
## Max. :512.33
##
str(allSet)
## 'data.frame': 1045 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 7 8 9 10 11 ...
## $ Survived : int 0 1 1 1 0 0 0 1 1 1 ...
## $ Pclass : int 3 1 3 1 3 1 3 3 2 3 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ SibSp : int 1 1 0 1 0 0 3 0 1 1 ...
## $ Parch : int 0 0 0 0 0 0 1 2 0 1 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
allSet$Survived <- as.factor(allSet$Survived)
allSet$Pclass <- as.factor(allSet$Pclass)
allSet$Sex <- as.factor(allSet$Sex)
allSet$Cabin <- as.factor(allSet$Cabin)
allSet$Embarked <- as.factor(allSet$Embarked)
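The five conversions above can also be written in one pass; a base-R sketch:

# convert all categorical columns to factors in one step
catCols <- c("Survived", "Pclass", "Sex", "Cabin", "Embarked")
allSet[catCols] <- lapply(allSet[catCols], as.factor)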
Split the data into training and test sets in a 75/25 ratio. Note that because allSet stacks train and test, testTemp also contains the original test rows, whose Survived is NA; confusionMatrix() drops those rows, which is why the confusion matrices below are based on 178 rather than 509 observations. The seed is set before sampling so the split is reproducible.
set.seed(123)
# number of training rows
nTrain <- round(0.75 * nrow(train))
# sample row IDs
sampleTrain <- sample(nrow(train), nTrain)
# create trainTemp and testTemp data sets
trainTemp <- allSet[sampleTrain, ]
testTemp <- allSet[-sampleTrain, ]
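caret also offers a stratified alternative that preserves the proportion of survivors in both partitions; a sketch (createDataPartition stratifies on the outcome):

set.seed(123)
# rows 1..nrow(train) of allSet are the original training rows
idx <- createDataPartition(factor(train$Survived), p = 0.75, list = FALSE)
trainTemp <- allSet[idx, ]
testTemp <- allSet[-idx, ]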
Model training
Random Forest
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
modelRF <- train(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked
, data = trainTemp, method = "rf",
trControl = control)
print(modelRF)
## Random Forest
##
## 536 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 483, 483, 482, 482, 482, 483, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8085954 0.5870742
## 5 0.8122874 0.6018561
## 9 0.8128116 0.6047217
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.
rfPred <- predict(modelRF, testTemp)
rfPred
## [1] 0 0 1 0 1 0 0 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0
## [36] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 1
## [71] 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1 1 0 1 0 0 0
## [106] 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1
## [141] 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1
## [176] 0 0 0 0 1 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 1 0
## [211] 0 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0
## [246] 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 1 1 1 1 0 1 0
## [281] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0
## [316] 0 1 1 1 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0
## [351] 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0
## [386] 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0
## [421] 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0
## [456] 0 0 0 0 1 0 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0
## [491] 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 0 1 0
## Levels: 0 1
rfCM <- confusionMatrix(rfPred, testTemp$Survived)
rfCM
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 93 38
## 1 8 39
##
## Accuracy : 0.7416
## 95% CI : (0.6707, 0.8042)
## No Information Rate : 0.5674
## P-Value [Acc > NIR] : 1.083e-06
##
## Kappa : 0.448
## Mcnemar's Test P-Value : 1.904e-05
##
## Sensitivity : 0.9208
## Specificity : 0.5065
## Pos Pred Value : 0.7099
## Neg Pred Value : 0.8298
## Prevalence : 0.5674
## Detection Rate : 0.5225
## Detection Prevalence : 0.7360
## Balanced Accuracy : 0.7136
##
## 'Positive' Class : 0
##
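pROC is loaded above but never used; a minimal sketch of an ROC/AUC check for the random forest, assuming caret's class-probability predictions (column 2 is the probability of Survived = 1) and dropping the NA outcomes that come from the stacked test rows:

rfProb <- predict(modelRF, testTemp, type = "prob")[, 2]
keep <- !is.na(testTemp$Survived)
rfROC <- roc(response = testTemp$Survived[keep], predictor = rfProb[keep])
auc(rfROC)
plot(rfROC)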
Gradient Boosting
modelGBM <- train(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked,
data = trainTemp, method = "gbm",
trControl = control, verbose = FALSE)
print(modelGBM)
## Stochastic Gradient Boosting
##
## 536 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 482, 483, 482, 483, 483, 482, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8034536 0.5887122
## 1 100 0.8071810 0.5959242
## 1 150 0.8121892 0.6058379
## 2 50 0.8122362 0.6011295
## 2 100 0.8128423 0.6020881
## 2 150 0.8122250 0.6008935
## 3 50 0.8146708 0.6046750
## 3 100 0.8115960 0.5984884
## 3 150 0.8153006 0.6049077
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
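The grid above is caret's default for gbm; to search a custom grid instead, a sketch (the candidate values are illustrative, not tuned):

gbmGrid <- expand.grid(n.trees = c(100, 300, 500),
                       interaction.depth = 2:4,
                       shrinkage = c(0.05, 0.1),
                       n.minobsinnode = 10)
modelGBM2 <- train(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked,
                   data = trainTemp, method = "gbm",
                   trControl = control, tuneGrid = gbmGrid, verbose = FALSE)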
gbpred <- predict(modelGBM, testTemp)
gbpred
## [1] 0 1 1 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0
## [36] 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1
## [71] 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1 1 0 1 0 0 0
## [106] 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 0 0
## [141] 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1
## [176] 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 0
## [211] 1 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 1 0 1 0 1 0
## [246] 1 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 1 1 0 1 1 1 1 1 0 1 0
## [281] 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0
## [316] 0 1 1 1 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0
## [351] 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0
## [386] 0 1 1 0 0 1 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0
## [421] 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0
## [456] 0 0 0 0 1 0 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0
## [491] 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0
## Levels: 0 1
gbmCM <- confusionMatrix(gbpred, testTemp$Survived)
gbmCM
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 89 37
## 1 12 40
##
## Accuracy : 0.7247
## 95% CI : (0.6529, 0.7889)
## No Information Rate : 0.5674
## P-Value [Acc > NIR] : 1.049e-05
##
## Kappa : 0.4167
## Mcnemar's Test P-Value : 0.0006068
##
## Sensitivity : 0.8812
## Specificity : 0.5195
## Pos Pred Value : 0.7063
## Neg Pred Value : 0.7692
## Prevalence : 0.5674
## Detection Rate : 0.5000
## Detection Prevalence : 0.7079
## Balanced Accuracy : 0.7003
##
## 'Positive' Class : 0
##
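A quick check of which predictors drive the boosted model; caret scales the gbm relative influence to 0-100:

varImp(modelGBM)
plot(varImp(modelGBM))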
Support Vector Machine
modelSVM <- train(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked,
data = trainTemp, method = "svmRadial",
trControl = control)
print(modelSVM)
## Support Vector Machines with Radial Basis Function Kernel
##
## 536 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 482, 482, 483, 483, 483, 482, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.8160350 0.6034106
## 0.50 0.8172808 0.6084030
## 1.00 0.8172463 0.6093012
##
## Tuning parameter 'sigma' was held constant at a value of 0.1302207
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.1302207 and C = 0.5.
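Only three cost values were tried (caret's default tuneLength of 3, with sigma estimated from the data and held constant); widening the cost search is one argument away, as in this sketch:

modelSVM2 <- train(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked,
                   data = trainTemp, method = "svmRadial",
                   trControl = control, tuneLength = 8)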
svmPred <- predict(modelSVM, testTemp)
svmPred
## [1] 0 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0
## [36] 1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0
## [71] 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1 1 0 1 0 0 0
## [106] 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1
## [141] 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 1
## [176] 0 0 0 0 1 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0
## [211] 1 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0
## [246] 1 0 1 0 1 1 0 1 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 1 1 0 1 0 1 1 1 0 1 0
## [281] 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 1 0 0 1 0 0
## [316] 0 1 1 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0
## [351] 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0
## [386] 0 1 1 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0
## [421] 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0
## [456] 0 0 1 0 1 1 0 1 1 0 1 1 1 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0
## [491] 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 1 1 0
## Levels: 0 1
svmCM <- confusionMatrix(svmPred, testTemp$Survived)
svmCM
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 84 36
## 1 17 41
##
## Accuracy : 0.7022
## 95% CI : (0.6293, 0.7683)
## No Information Rate : 0.5674
## P-Value [Acc > NIR] : 0.0001486
##
## Kappa : 0.3751
## Mcnemar's Test P-Value : 0.0134175
##
## Sensitivity : 0.8317
## Specificity : 0.5325
## Pos Pred Value : 0.7000
## Neg Pred Value : 0.7069
## Prevalence : 0.5674
## Detection Rate : 0.4719
## Detection Prevalence : 0.6742
## Balanced Accuracy : 0.6821
##
## 'Positive' Class : 0
##
Comparing accuracy
modelRFacc <- percent(as.numeric(rfCM$overall[1]))
modelRFerr <- percent(1-(as.numeric(rfCM$overall[1])))
modelGBMacc <- percent(as.numeric(gbmCM$overall[1]))
modelGBMerr <- percent(1-(as.numeric(gbmCM$overall[1])))
modelSVMacc <- percent(as.numeric(svmCM$overall[1]))
modelSVMerr <- percent(1-(as.numeric(svmCM$overall[1])))
Results
tblAcc <- data.frame("Accuracy" = c(modelRFacc, modelGBMacc, modelSVMacc),
                     "Error" = c(modelRFerr, modelGBMerr, modelSVMerr),
                     row.names = c("RF", "GBM", "SVM"))
tblAcc
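Test-set accuracy on 178 rows carries wide confidence intervals (see the 95% CIs above), so the cross-validation resamples give a steadier comparison. For a strict like-for-like comparison the three train() calls should share resampling indices (e.g. set the same seed before each call, or pass index in trainControl); a sketch:

# pool the CV results from the three fitted models
results <- resamples(list(RF = modelRF, GBM = modelGBM, SVM = modelSVM))
summary(results)
bwplot(results)  # lattice is already loaded by caret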