Including Plots for EDA


Median reviewer age, summarised by Rating, Recommended.IND, Division.Name and Department.Name:

## # A tibble: 5 x 2
##   Rating `median(Age)`
##    <int>         <dbl>
## 1      1            42
## 2      2            41
## 3      3            40
## 4      4            41
## 5      5            41
## # A tibble: 2 x 2
##   Recommended.IND `median(Age)`
##             <int>         <dbl>
## 1               0            40
## 2               1            41
## # A tibble: 4 x 2
##   Division.Name  `median(Age)`
##   <chr>                  <dbl>
## 1 General                   41
## 2 General Petite            41
## 3 Initmates                 39
## 4 <NA>                      38
## # A tibble: 7 x 2
##   Department.Name `median(Age)`
##   <chr>                   <dbl>
## 1 Bottoms                    41
## 2 Dresses                    40
## 3 Intimate                   39
## 4 Jackets                    42
## 5 Tops                       42
## 6 Trend                      43
## 7 <NA>                       38
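The code producing these summaries is not echoed above; they follow the standard dplyr pattern, for example (a sketch, assuming the raw reviews are in `data`):

library(dplyr)

# median reviewer age per rating level; swap the grouping column for the other tables
data %>% group_by(Rating) %>% summarise(median(Age))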

Median age barely varies across these groupings (roughly 38 to 43), so Age adds little signal on its own. From here on, only the term counts from the Title corpus plus a few engineered variables are used to build the models.
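TitleSparse is assumed to have been built earlier from the Title text; a minimal sketch of how such a sparse document-term data frame is typically produced with tm (the exact preprocessing steps and the sparsity cutoff here are assumptions, not necessarily the settings used above):

library(tm)

# build a corpus from the review titles and normalise the text
corpus <- VCorpus(VectorSource(as.character(data$Title)))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)

# document-term matrix, keeping only reasonably frequent terms
dtm <- DocumentTermMatrix(corpus)
dtm <- removeSparseTerms(dtm, 0.995)   # sparsity cutoff is an assumption
TitleSparse <- as.data.frame(as.matrix(dtm))
colnames(TitleSparse) <- make.names(colnames(TitleSparse))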

# drop the feedback count and set the target up as a factor
TitleSparse$Positive.Feedback.Count <- NULL
TitleSparse$Recommended.IND <- as.factor(data$Recommended.IND)

# engineered punctuation flags from the raw Title text
# ("\\..." matches a literal dot followed by any two characters, a rough "..." detector)
TitleSparse$containsdots         <- ifelse(grepl("\\...", data$Title), 1, 0)
TitleSparse$containsexclamation  <- ifelse(grepl("\\!", data$Title), 1, 0)
TitleSparse$containsquestionmark <- ifelse(grepl("\\?", data$Title), 1, 0)

explore<-TitleSparse %>% group_by(containsdots) %>% summarise(mean(as.numeric(Recommended.IND)-1))
print(explore)
## # A tibble: 2 x 2
##   containsdots `mean(as.numeric(Recommended.IND) - 1)`
##          <dbl>                                   <dbl>
## 1            0                                   0.829
## 2            1                                   0.605
explore<-TitleSparse %>% group_by(containsexclamation) %>% summarise(mean(as.numeric(Recommended.IND)-1))
print(explore)
## # A tibble: 2 x 2
##   containsexclamation `mean(as.numeric(Recommended.IND) - 1)`
##                 <dbl>                                   <dbl>
## 1                   0                                   0.800
## 2                   1                                   0.924
explore<-TitleSparse %>% group_by(containsquestionmark) %>% summarise(mean(as.numeric(Recommended.IND)-1))
print(explore)
## # A tibble: 2 x 2
##   containsquestionmark `mean(as.numeric(Recommended.IND) - 1)`
##                  <dbl>                                   <dbl>
## 1                    0                                   0.823
## 2                    1                                   0.669
library(caTools)
# note: sample.split() is normally given the outcome vector
# (e.g. TitleSparse$Recommended.IND) so that the 80/20 split is stratified by class
splits <- sample.split(TitleSparse, SplitRatio = 0.8)
train  <- subset(TitleSparse, splits == TRUE)
test   <- subset(TitleSparse, splits == FALSE)

library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library(caret)
## Loading required package: lattice
# conditional inference tree, tuned with 5-fold cross-validation
output.tree <- train(factor(Recommended.IND) ~ ., data = train, method = "ctree",
                     trControl = trainControl(method = "cv", number = 5))

# probability of the positive class ("1") on the test and training sets
predictionsctree      <- predict(output.tree, test,  type = "prob")[, 2]
predictionsctreetrain <- predict(output.tree, train, type = "prob")[, 2]
plot(output.tree)

print(output.tree)
## Conditional Inference Tree 
## 
## 18381 samples
##    22 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 14704, 14704, 14705, 14706, 14705 
## Resampling results across tuning parameters:
## 
##   mincriterion  Accuracy   Kappa    
##   0.01          0.8373859  0.2525123
##   0.50          0.8373315  0.2477633
##   0.99          0.8369510  0.2203518
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mincriterion = 0.01.
# logistic regression (GLM), 5-fold cross-validation
modelglm <- train(factor(Recommended.IND) ~ ., data = train, method = "glm",
                  trControl = trainControl(method = "cv", number = 5))

# probability of the positive class on the test and training sets
predictionsGLM      <- predict(modelglm, test,  type = "prob")[, 2]
predictionsGLMtrain <- predict(modelglm, train, type = "prob")[, 2]
print(table(predictionsGLM>0.5,test$Recommended.IND))
##        
##            0    1
##   FALSE  145   96
##   TRUE   748 4116
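For reference, the overall test accuracy implied by this table is (145 + 4116) / 5105 ≈ 0.835. caret's confusionMatrix() reports this together with sensitivity and specificity; predClassGLM below is an illustrative name, not part of the original script:

# threshold the predicted probabilities at 0.5 and summarise with caret
predClassGLM <- factor(ifelse(predictionsGLM > 0.5, "1", "0"),
                       levels = levels(test$Recommended.IND))
confusionMatrix(predClassGLM, test$Recommended.IND)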
print(modelglm)
## Generalized Linear Model 
## 
## 18381 samples
##    22 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 14704, 14704, 14706, 14705, 14705 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8339044  0.1954241
# gradient boosting (GBM), 5-fold cross-validation;
# verbose = FALSE suppresses the long per-iteration training log
modelgbm <- train(factor(Recommended.IND) ~ ., data = train, method = "gbm",
                  trControl = trainControl(method = "cv", number = 5), verbose = FALSE)
predictionsGBM<-predict(modelgbm,test,type="prob")[,2]
predictionsGBMtrain<-predict(modelgbm,train,type="prob")[,2]
plot(modelgbm)

print(modelgbm)
## Stochastic Gradient Boosting 
## 
## 18381 samples
##    22 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 14706, 14704, 14705, 14705, 14704 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.8328709  0.1778144
##   1                  100      0.8337414  0.1940634
##   1                  150      0.8336326  0.1967619
##   2                   50      0.8347205  0.1999170
##   2                  100      0.8341767  0.1987011
##   2                  150      0.8338504  0.2111095
##   3                   50      0.8361895  0.2093033
##   3                  100      0.8348840  0.2173995
##   3                  150      0.8360809  0.2295391
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth
##  = 3, shrinkage = 0.1 and n.minobsinnode = 10.
library("xgboost")
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library("Matrix")
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
library(dplyr)
# xgboost needs a numeric 0/1 label and a plain numeric feature matrix
train$Recommended.IND <- as.numeric(train$Recommended.IND) - 1

data_variables <- as.matrix(train %>% select(-Recommended.IND))
data_label     <- train[, "Recommended.IND"]
data_matrix    <- xgb.DMatrix(data = data_variables, label = data_label)

numberOfClasses <- length(unique(train$Recommended.IND))  # not strictly needed for binary:logistic
xgb_params <- list(booster = "gbtree", objective = "binary:logistic",
                   eta = 0.1, gamma = 5, max_depth = 10)
# 10-fold CV to pick a good number of boosting rounds
xgbcv <- xgb.cv(params = xgb_params, data = data_matrix, nrounds = 100, nfold = 10,
                showsd = TRUE, stratified = TRUE, print_every_n = 10,
                early_stopping_rounds = 20, maximize = FALSE)
## [1]  train-error:0.159108+0.001024   test-error:0.162288+0.006938 
## Multiple eval metrics are present. Will use test_error for early stopping.
## Will train until test_error hasn't improved in 20 rounds.
## 
## [11] train-error:0.160347+0.000871   test-error:0.163104+0.008183 
## [21] train-error:0.159960+0.000995   test-error:0.162179+0.007848 
## [31] train-error:0.159489+0.001176   test-error:0.162071+0.007700 
## [41] train-error:0.158237+0.000882   test-error:0.160819+0.007373 
## [51] train-error:0.158080+0.000961   test-error:0.160874+0.007085 
## [61] train-error:0.158080+0.000849   test-error:0.160819+0.007199 
## Stopping. Best iteration:
## [44] train-error:0.158159+0.000906   test-error:0.160547+0.007129
nround <- xgbcv$best_iteration  # best number of boosting rounds found by CV

# fit the final XGBoost model on the full training matrix
bst_model <- xgb.train(params = xgb_params,
                       data = data_matrix,
                       nrounds = nround)

test$Recommended.IND<-as.numeric(test$Recommended.IND)-1


test_matrix<-xgb.DMatrix(data = as.matrix(test %>% select(-Recommended.IND)))
predictionsXGBoost<-predict(bst_model,newdata=test_matrix)

predictionsXGBoosttrain<-predict(bst_model,newdata=data_matrix)
print(table(predictionsXGBoost>0.5,test$Recommended.IND))
##        
##            0    1
##   FALSE  165  104
##   TRUE   728 4108
# Stacking: append the GLM, GBM, XGBoost and ctree predictions as extra features
# and fit a linear model on top. (Note: the training-set predictions are in-sample,
# not out-of-fold, so the meta-model sees slightly optimistic inputs.)
df <- data.frame(train,
                 a = predictionsGLMtrain, b = predictionsGBMtrain,
                 c = predictionsXGBoosttrain, d = predictionsctreetrain)

testdf <- data.frame(test,
                     a = predictionsGLM, b = predictionsGBM,
                     c = predictionsXGBoost, d = predictionsctree)

model <- lm(Recommended.IND ~ ., data = df)


predictions<-predict(model,testdf)
table(predictions>0.5,test$Recommended.IND)
##        
##            0    1
##   FALSE  195  130
##   TRUE   698 4082
# Stacking nudges the test accuracy up: (195 + 4082) / 5105 ~ 0.838,
# versus ~0.837 for XGBoost alone and ~0.835 for the GLM.

Future work that could increase accuracy:

Using more terms from the corpus may improve accuracy, although the memory footprint grows quickly. The categorical variables can be mean- or frequency-encoded, which may also help (see the sketch below). Stacking more models and bagging are further options, as are n-grams, at the cost of more space and longer training time.
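For example, a minimal sketch of frequency and mean (target) encoding for one categorical column. Department.Name is used purely for illustration, and data$Recommended.IND is assumed to still be the raw 0/1 indicator; in practice the encodings should be computed on the training split only to avoid leakage:

library(dplyr)

# frequency encoding: replace each department by how often it occurs
dept_freq <- data %>% count(Department.Name, name = "Department.Freq")

# mean (target) encoding: replace each department by its mean recommendation rate
dept_mean <- data %>%
  group_by(Department.Name) %>%
  summarise(Department.TargetMean = mean(Recommended.IND))

data <- data %>%
  left_join(dept_freq, by = "Department.Name") %>%
  left_join(dept_mean, by = "Department.Name")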

Here I have achieved an accuracy of ~84% using 22 variables (including 3 engineered ones), which is reasonable given the modest CPU time and memory required.