R Markdown
library(knitr)
library(ggplot2)
library(gridExtra)
library(scales)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
# read the data sets and drop rows with any missing values
train <- na.omit(read.csv("C:\\Users\\pranit\\Documents\\Imarticus\\Model\\train.csv"))
test <- na.omit(read.csv("C:\\Users\\pranit\\Documents\\Imarticus\\Model\\test.csv"))
allSet <- bind_rows(train, test)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
(bind_rows repeats this pair of warnings for each factor column it coerces to character.)
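The coercion warnings appear because read.csv() (in R versions before 4.0) converts strings to factors, and train and test end up with differing factor levels. A minimal sketch that avoids them by keeping strings as characters until the explicit factor conversions below:

train <- na.omit(read.csv("C:\\Users\\pranit\\Documents\\Imarticus\\Model\\train.csv",
                          stringsAsFactors = FALSE))
test <- na.omit(read.csv("C:\\Users\\pranit\\Documents\\Imarticus\\Model\\test.csv",
                         stringsAsFactors = FALSE))
allSet <- bind_rows(train, test)  # binds cleanly, no coercion warnings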
summary(allSet)
## PassengerId Survived Pclass Name
## Min. : 1 Min. :0.0000 Min. :1.000 Length:1045
## 1st Qu.: 326 1st Qu.:0.0000 1st Qu.:1.000 Class :character
## Median : 662 Median :0.0000 Median :2.000 Mode :character
## Mean : 655 Mean :0.4062 Mean :2.207
## 3rd Qu.: 973 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :1307 Max. :1.0000 Max. :3.000
## NA's :331
## Sex Age SibSp Parch
## female:388 Min. : 0.17 Min. :0.0000 Min. :0.0000
## male :657 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median :28.00 Median :0.0000 Median :0.0000
## Mean :29.85 Mean :0.5033 Mean :0.4211
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :80.00 Max. :8.0000 Max. :6.0000
##
## Ticket Fare Cabin Embarked
## Length:1045 Min. : 0.00 Length:1045 Length:1045
## Class :character 1st Qu.: 8.05 Class :character Class :character
## Mode :character Median : 15.75 Mode :character Mode :character
## Mean : 36.69
## 3rd Qu.: 35.50
## Max. :512.33
##
str(allSet)
## 'data.frame': 1045 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 7 8 9 10 11 ...
## $ Survived : int 0 1 1 1 0 0 0 1 1 1 ...
## $ Pclass : int 3 1 3 1 3 1 3 3 2 3 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ SibSp : int 1 1 0 1 0 0 3 0 1 1 ...
## $ Parch : int 0 0 0 0 0 0 1 2 0 1 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
allSet$Survived <- as.factor(allSet$Survived)
allSet$Pclass <- as.factor(allSet$Pclass)
allSet$Sex <- as.factor(allSet$Sex)
allSet$Cabin <- as.factor(allSet$Cabin)
allSet$Embarked <- as.factor(allSet$Embarked)
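The five conversions above can also be written in one pass; a base-R sketch:

# convert all categorical columns to factors in one step
catCols <- c("Survived", "Pclass", "Sex", "Cabin", "Embarked")
allSet[catCols] <- lapply(allSet[catCols], as.factor)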
Split the data into training and test sets in a 75/25 ratio. Note that because allSet stacks train and test, testTemp also contains the original test rows, whose Survived is NA; confusionMatrix() drops those rows, which is why the confusion matrices below are based on 178 rather than 509 observations. The seed is set before sampling so the split is reproducible.
set.seed(123)
# number of training rows
nTrain <- round(0.75 * nrow(train))
# sample row IDs
sampleTrain <- sample(nrow(train), nTrain)
# create trainTemp and testTemp data sets
trainTemp <- allSet[sampleTrain, ]
testTemp <- allSet[-sampleTrain, ]
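caret also offers a stratified alternative that preserves the proportion of survivors in both partitions; a sketch (createDataPartition stratifies on the outcome):

set.seed(123)
# rows 1..nrow(train) of allSet are the original training rows
idx <- createDataPartition(factor(train$Survived), p = 0.75, list = FALSE)
trainTemp <- allSet[idx, ]
testTemp <- allSet[-idx, ]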
Model training
Random Forest
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
modelRF <- train(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked
, data = trainTemp, method = "rf",
trControl = control)
print(modelRF)
## Random Forest
##
## 536 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 483, 483, 482, 482, 482, 483, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8085954 0.5870742
## 5 0.8122874 0.6018561
## 9 0.8128116 0.6047217
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.
rfPred <- predict(modelRF, testTemp)
rfPred
## [1] 0 0 1 0 1 0 0 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0
## [36] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 1
## [71] 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1 1 0 1 0 0 0
## [106] 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1
## [141] 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1
## [176] 0 0 0 0 1 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 1 0
## [211] 0 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0
## [246] 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 1 1 1 1 0 1 0
## [281] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0
## [316] 0 1 1 1 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0
## [351] 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0
## [386] 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0
## [421] 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0
## [456] 0 0 0 0 1 0 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0
## [491] 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 0 1 0
## Levels: 0 1
rfCM <- confusionMatrix(rfPred, testTemp$Survived)
rfCM
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 93 38
## 1 8 39
##
## Accuracy : 0.7416
## 95% CI : (0.6707, 0.8042)
## No Information Rate : 0.5674
## P-Value [Acc > NIR] : 1.083e-06
##
## Kappa : 0.448
## Mcnemar's Test P-Value : 1.904e-05
##
## Sensitivity : 0.9208
## Specificity : 0.5065
## Pos Pred Value : 0.7099
## Neg Pred Value : 0.8298
## Prevalence : 0.5674
## Detection Rate : 0.5225
## Detection Prevalence : 0.7360
## Balanced Accuracy : 0.7136
##
## 'Positive' Class : 0
##
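pROC is loaded above but never used; a minimal sketch of an ROC/AUC check for the random forest, assuming caret's class-probability predictions (column 2 is the probability of Survived = 1) and dropping the NA outcomes that come from the stacked test rows:

rfProb <- predict(modelRF, testTemp, type = "prob")[, 2]
keep <- !is.na(testTemp$Survived)
rfROC <- roc(response = testTemp$Survived[keep], predictor = rfProb[keep])
auc(rfROC)
plot(rfROC)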
Gradient Boosting
modelGBM <- train(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked,
data = trainTemp, method = "gbm",
trControl = control, verbose = FALSE)
print(modelGBM)
## Stochastic Gradient Boosting
##
## 536 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 482, 483, 482, 483, 483, 482, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8034536 0.5887122
## 1 100 0.8071810 0.5959242
## 1 150 0.8121892 0.6058379
## 2 50 0.8122362 0.6011295
## 2 100 0.8128423 0.6020881
## 2 150 0.8122250 0.6008935
## 3 50 0.8146708 0.6046750
## 3 100 0.8115960 0.5984884
## 3 150 0.8153006 0.6049077
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
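The grid above is caret's default for gbm; to search a custom grid instead, a sketch (the candidate values are illustrative, not tuned):

gbmGrid <- expand.grid(n.trees = c(100, 300, 500),
                       interaction.depth = 2:4,
                       shrinkage = c(0.05, 0.1),
                       n.minobsinnode = 10)
modelGBM2 <- train(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked,
                   data = trainTemp, method = "gbm",
                   trControl = control, tuneGrid = gbmGrid, verbose = FALSE)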
gbpred <- predict(modelGBM, testTemp)
gbpred
## [1] 0 1 1 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0
## [36] 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1
## [71] 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1 1 0 1 0 0 0
## [106] 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 0 0
## [141] 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1
## [176] 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 0
## [211] 1 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 1 0 1 0 1 0
## [246] 1 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 1 1 0 1 1 1 1 1 0 1 0
## [281] 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0
## [316] 0 1 1 1 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0
## [351] 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0
## [386] 0 1 1 0 0 1 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0
## [421] 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0
## [456] 0 0 0 0 1 0 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0
## [491] 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0
## Levels: 0 1
gbmCM <- confusionMatrix(gbpred, testTemp$Survived)
gbmCM
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 89 37
## 1 12 40
##
## Accuracy : 0.7247
## 95% CI : (0.6529, 0.7889)
## No Information Rate : 0.5674
## P-Value [Acc > NIR] : 1.049e-05
##
## Kappa : 0.4167
## Mcnemar's Test P-Value : 0.0006068
##
## Sensitivity : 0.8812
## Specificity : 0.5195
## Pos Pred Value : 0.7063
## Neg Pred Value : 0.7692
## Prevalence : 0.5674
## Detection Rate : 0.5000
## Detection Prevalence : 0.7079
## Balanced Accuracy : 0.7003
##
## 'Positive' Class : 0
##
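A quick check of which predictors drive the boosted model; caret scales the gbm relative influence to 0-100:

varImp(modelGBM)
plot(varImp(modelGBM))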
Support Vector Machine
modelSVM <- train(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked,
data = trainTemp, method = "svmRadial",
trControl = control)
print(modelSVM)
## Support Vector Machines with Radial Basis Function Kernel
##
## 536 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 482, 482, 483, 483, 483, 482, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.8160350 0.6034106
## 0.50 0.8172808 0.6084030
## 1.00 0.8172463 0.6093012
##
## Tuning parameter 'sigma' was held constant at a value of 0.1302207
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.1302207 and C = 0.5.
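Only three cost values were tried (caret's default tuneLength of 3, with sigma estimated from the data and held constant); widening the cost search is one argument away, as in this sketch:

modelSVM2 <- train(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked,
                   data = trainTemp, method = "svmRadial",
                   trControl = control, tuneLength = 8)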
svmPred <- predict(modelSVM, testTemp)
svmPred
## [1] 0 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0
## [36] 1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0
## [71] 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1 1 0 1 0 0 0
## [106] 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1
## [141] 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 1
## [176] 0 0 0 0 1 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0
## [211] 1 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0
## [246] 1 0 1 0 1 1 0 1 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 1 1 0 1 0 1 1 1 0 1 0
## [281] 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 1 0 0 1 0 0
## [316] 0 1 1 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0
## [351] 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0
## [386] 0 1 1 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0
## [421] 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0
## [456] 0 0 1 0 1 1 0 1 1 0 1 1 1 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0
## [491] 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 1 1 0
## Levels: 0 1
svmCM <- confusionMatrix(svmPred, testTemp$Survived)
svmCM
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 84 36
## 1 17 41
##
## Accuracy : 0.7022
## 95% CI : (0.6293, 0.7683)
## No Information Rate : 0.5674
## P-Value [Acc > NIR] : 0.0001486
##
## Kappa : 0.3751
## Mcnemar's Test P-Value : 0.0134175
##
## Sensitivity : 0.8317
## Specificity : 0.5325
## Pos Pred Value : 0.7000
## Neg Pred Value : 0.7069
## Prevalence : 0.5674
## Detection Rate : 0.4719
## Detection Prevalence : 0.6742
## Balanced Accuracy : 0.6821
##
## 'Positive' Class : 0
##
Comparing accuracy
modelRFacc <- percent(as.numeric(rfCM$overall[1]))
modelRFerr <- percent(1-(as.numeric(rfCM$overall[1])))
modelGBMacc <- percent(as.numeric(gbmCM$overall[1]))
modelGBMerr <- percent(1-(as.numeric(gbmCM$overall[1])))
modelSVMacc <- percent(as.numeric(svmCM$overall[1]))
modelSVMerr <- percent(1-(as.numeric(svmCM$overall[1])))
Results
tblAcc <- data.frame("Accuracy" = c(modelRFacc, modelGBMacc, modelSVMacc),
                     "Error" = c(modelRFerr, modelGBMerr, modelSVMerr),
                     row.names = c("RF", "GBM", "SVM"))
tblAcc
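Test-set accuracy on 178 rows carries wide confidence intervals (see the 95% CIs above), so the cross-validation resamples give a steadier comparison. For a strict like-for-like comparison the three train() calls should share resampling indices (e.g. set the same seed before each call, or pass index in trainControl); a sketch:

# pool the CV results from the three fitted models
results <- resamples(list(RF = modelRF, GBM = modelGBM, SVM = modelSVM))
summary(results)
bwplot(results)  # lattice is already loaded by caret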